diff --git a/tensorflow_networking/verbs/BUILD b/tensorflow_networking/verbs/BUILD
index 88a93eb..3e50adc 100644
--- a/tensorflow_networking/verbs/BUILD
+++ b/tensorflow_networking/verbs/BUILD
@@ -1,13 +1,41 @@
 # Description:
 #   Verbs RDMA communication interfaces and implementations for TensorFlow.
 
-package(default_visibility = [
-    "//tensorflow_networking:__subpackages__",
-])
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
-licenses(["notice"])  # Apache 2.0
+# For platform specific build config
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config.bzl",
+    "tf_proto_library_cc",
+)
 
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load(
+    "@org_tensorflow//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+    "tf_cuda_library",
+)
+
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
+
+# For platform specific build config
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config.bzl",
+    "tf_kernel_tests_linkstatic",
+)
+
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+
+package(
+    default_visibility = [
+        "//tensorflow_networking:__subpackages__",
+    ],
+    licenses = ["notice"],  # Apache 2.0
+)
 
 exports_files(["LICENSE"])
 
@@ -19,12 +47,6 @@ filegroup(
     ]),
 )
 
-# For platform specific build config
-load(
-    "@org_tensorflow//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_cc",
-)
-
 tf_proto_library_cc(
     name = "verbs_service_proto",
     srcs = ["verbs_service.proto"],
@@ -43,6 +65,10 @@ cc_library(
         "@org_tensorflow//tensorflow/core:framework",
         "@org_tensorflow//tensorflow/core:lib",
     ],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:with_verbs_support": ["-libverbs"],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -52,9 +78,10 @@ cc_library(
     deps = [
         ":grpc_verbs_service_impl",
         ":rdma_mgr",
+        ":rdma",
         ":verbs_service_proto_cc",
         "@org_tensorflow//tensorflow:grpc++",
-        "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:async_service_interface",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_call",
@@ -77,6 +104,7 @@ cc_library(
     name = "grpc_verbs_client",
     srcs = ["grpc_verbs_client.cc"],
     hdrs = ["grpc_verbs_client.h"],
+    copts = ["-Og", "-g3"],
     deps = [
         ":grpc_verbs_service_impl",
         ":verbs_service_proto_cc",
@@ -90,27 +118,34 @@ cc_library(
 cc_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
-    hdrs = ["rdma_rendezvous_mgr.h"],
+    hdrs = ["rdma_rendezvous_mgr.h", "rdma.h"],
+    copts = ["-Og", "-g3"],
    deps = [
         ":rdma_mgr",
         ":verbs_util",
-        "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
+        #"@org_tensorflow//tensorflow/core:gpu_runtime",
         "@org_tensorflow//tensorflow/core:lib",
         "@org_tensorflow//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
+        #"@org_tensorflow//tensorflow/core/distributed_runtime:worker_cache_partial",
     ],
 )
 
 tf_cuda_library(
     name = "rdma_mgr",
     srcs = ["rdma_mgr.cc"],
-    hdrs = ["rdma_mgr.h"],
+    hdrs = ["rdma_mgr.h", "rdma.h"],
+    copts = ["-Og", "-g3"],
     deps = [
         ":grpc_verbs_client",
-        ":rdma",
+        #":rdma",
+        ":verbs_util",
         ":verbs_service_proto_cc",
         "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
         "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_channel",
@@ -118,21 +153,31 @@ tf_cuda_library(
     ],
 )
 
+
 tf_cuda_library(
     name = "rdma",
     srcs = ["rdma.cc"],
     hdrs = ["rdma.h"],
-    linkopts = ["-libverbs"],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:with_verbs_support": ["-libverbs"],
+        "//conditions:default": [],
+    }),
+    copts = ["-Og", "-g3"],
     deps = [
+        ":rdma_mgr",
         ":grpc_verbs_client",
         ":verbs_service_proto_cc",
         ":verbs_util",
-        "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
         "@org_tensorflow//tensorflow/core:framework",
+        #"@org_tensorflow//tensorflow/core:framework_internal",
+        #"@org_tensorflow//tensorflow/core:gpu_runtime",
         "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
        "@org_tensorflow//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
+        "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_channel",
     ],
 )
 
@@ -151,3 +196,4 @@ cc_library(
     ],
     alwayslink = 1,
 )
+
diff --git a/tensorflow_networking/verbs/Dockerfile b/tensorflow_networking/verbs/Dockerfile
deleted file mode 100644
index cecb40d..0000000
--- a/tensorflow_networking/verbs/Dockerfile
+++ /dev/null
@@ -1,82 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-
-FROM ubuntu:${UBUNTU_VERSION} AS base
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        libibverbs-dev \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-ENV CI_BUILD_PYTHON python
-
-ARG USE_PYTHON_3_NOT_2
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-# See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} --no-cache-dir install --upgrade \
-    pip \
-    setuptools
-
-# Some TF tools expect a "python" binary
-RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    wget \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-RUN ${PIP} --no-cache-dir install \
-    Pillow \
-    h5py \
-    keras_applications \
-    keras_preprocessing \
-    matplotlib \
-    mock \
-    numpy \
-    scipy \
-    sklearn \
-    pandas \
-    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
-    enum34
-
-# Install bazel
-ARG BAZEL_VERSION=0.24.1
-RUN mkdir /bazel && \
-    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
-    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
-    chmod +x /bazel/installer.sh && \
-    /bazel/installer.sh && \
-    rm -f /bazel/installer.sh
-
-ADD . /tf_networking
-WORKDIR /tf_networking
-RUN bazel build -c opt //tensorflow_networking/verbs:verbs_server_lib
diff --git a/tensorflow_networking/verbs/docker_howto.txt b/tensorflow_networking/verbs/docker_howto.txt
deleted file mode 100644
index 16b8392..0000000
--- a/tensorflow_networking/verbs/docker_howto.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Building a networking component may require libraries not available or installable in your normal environment.
-As of late 2018 the networking contrib extensions to TensorFlow 1.x can be built in a docker container, as follows.
-
-1. Ensure docker is installed.
-2. Invoke a docker container with the latest nightly development build:
-   $ docker run -it -w /tensorflow -v $PWD:/mnt -e HOST_PERMS="$(id -u):$(id -g)" tensorflow/tensorflow:nightly-devel bash
-3. Configure for bazel build
-   $ ./configure
-4. Install any necessary additional packages, e.g.
-   $ apt-get update
-   $ apt-get install libibverbs-dev
-5. Build with the desired extension
-   $ bazel build --config=verbs //tensorflow/tools/pip_package:build_pip_package
diff --git a/tensorflow_networking/verbs/grpc_verbs_client.cc b/tensorflow_networking/verbs/grpc_verbs_client.cc
index 28411fd..d0ad3ef 100644
--- a/tensorflow_networking/verbs/grpc_verbs_client.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_client.cc
@@ -37,6 +37,39 @@ Status GrpcVerbsClient::GetRemoteAddress(const GetRemoteAddressRequest* request,
   return GetRemoteAddress(&call_options, request, response);
 }
 
+
+Status GrpcVerbsClient::ReqDriverMessage(CallOptions* call_options,
+                                         const DriverMessageReq* request,
+                                         DriverMessageResp* response) {
+  ::grpc::ClientContext ctx;
+  ctx.set_fail_fast(false);
+  SetDeadline(&ctx, call_options->GetTimeout());
+  return FromGrpcStatus(stub_->ReqDriverMessage(&ctx, *request, response));
+}
+
+Status GrpcVerbsClient::ReqDriverMessage(const DriverMessageReq* request,
+                                         DriverMessageResp* response) {
+  CallOptions call_options;
+  call_options.SetTimeout(-1);  // no time out
+  return ReqDriverMessage(&call_options, request, response);
+}
+
+Status GrpcVerbsClient::ReqPleSendOrCheck(CallOptions* call_options,
+                                          const PleSendOrCheckReq* request,
+                                          PleSendOrCheckResp* response) {
+  ::grpc::ClientContext ctx;
+  ctx.set_fail_fast(false);
+  SetDeadline(&ctx, call_options->GetTimeout());
+  return FromGrpcStatus(stub_->ReqPleSendOrCheck(&ctx, *request, response));
+}
+
+Status GrpcVerbsClient::ReqPleSendOrCheck(const PleSendOrCheckReq* request,
+                                          PleSendOrCheckResp* response) {
+  CallOptions call_options;
+  call_options.SetTimeout(-1);  // no time out
+  return ReqPleSendOrCheck(&call_options, request, response);
+}
+
 void GrpcVerbsClient::SetDeadline(::grpc::ClientContext* ctx,
                                   int64 time_in_ms) {
   if (time_in_ms > 0) {
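An illustrative caller (not part of the patch; SendDriverHandshake is a hypothetical name) showing how the new synchronous wrappers are meant to be used — the overload without CallOptions runs with no deadline, mirroring what NotifyRemoteDriverEntry() does later in this patch:

// Sketch only: assumes the GrpcChannelCache, SharedGrpcChannelPtr, and
// GrpcVerbsClient types declared elsewhere in this repository.
Status SendDriverHandshake(GrpcChannelCache* cache, const string& local_name,
                           const string& remote_name) {
  SharedGrpcChannelPtr channel = cache->FindWorkerChannel(remote_name);
  CHECK(channel != nullptr) << "No worker known as " << remote_name;
  GrpcVerbsClient client(channel);
  DriverMessageReq req;
  req.set_host_name(local_name);  // tell the peer who is calling
  DriverMessageResp resp;
  // Blocks until the peer answers; SetTimeout(-1) is applied internally.
  return client.ReqDriverMessage(&req, &resp);
}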
diff --git a/tensorflow_networking/verbs/grpc_verbs_client.h b/tensorflow_networking/verbs/grpc_verbs_client.h
index db01798..c703f64 100644
--- a/tensorflow_networking/verbs/grpc_verbs_client.h
+++ b/tensorflow_networking/verbs/grpc_verbs_client.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
 #define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
 
+#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 
 namespace tensorflow {
 
@@ -37,6 +37,19 @@ class GrpcVerbsClient {
   Status GetRemoteAddress(const GetRemoteAddressRequest* request,
                           GetRemoteAddressResponse* response);
 
+  Status ReqDriverMessage(CallOptions* call_options,
+                          const DriverMessageReq* request,
+                          DriverMessageResp* response);
+  Status ReqDriverMessage(const DriverMessageReq* request,
+                          DriverMessageResp* response);
+
+  Status ReqPleSendOrCheck(CallOptions* call_options,
+                           const PleSendOrCheckReq* request,
+                           PleSendOrCheckResp* response);
+
+  Status ReqPleSendOrCheck(const PleSendOrCheckReq* request,
+                           PleSendOrCheckResp* response);
+
  private:
   std::unique_ptr<grpc::VerbsService::Stub> stub_;
 
diff --git a/tensorflow_networking/verbs/grpc_verbs_service.cc b/tensorflow_networking/verbs/grpc_verbs_service.cc
index 11d8704..b38358a 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_service.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/grpc_verbs_service.h"
+
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 
@@ -71,12 +74,34 @@ void GrpcVerbsService::Shutdown() {
     }                                                                        \
   } while (0)
 
+#define ENQUEUE_Driver_REQUEST(method, method_func, supports_cancel)           \
+  do {                                                                         \
+    mutex_lock l(shutdown_mu_);                                                \
+    if (!is_shutdown_) {                                                       \
+      Call<GrpcVerbsService, grpc::VerbsService::AsyncService,                 \
+           method##Req, method##Resp>::                                        \
+          EnqueueRequest(&verbs_service_, cq_,                                 \
+                         &grpc::VerbsService::AsyncService::Request##method_func, \
+                         &GrpcVerbsService::method_func##Handler,              \
+                         (supports_cancel));                                   \
+    }                                                                          \
+  } while (0)
+
+
 // This method blocks forever handling requests from the completion queue.
 void GrpcVerbsService::HandleRPCsLoop() {
   for (int i = 0; i < 10; ++i) {
     ENQUEUE_REQUEST(GetRemoteAddress, false);
   }
+  for (int i = 0; i < 10; ++i) {
+    ENQUEUE_Driver_REQUEST(DriverMessage, ReqDriverMessage, false);
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    ENQUEUE_Driver_REQUEST(PleSendOrCheck, ReqPleSendOrCheck, false);
+  }
+
 
   void* tag;
   bool ok;
@@ -98,6 +123,77 @@ void GrpcVerbsService::GetRemoteAddressHandler(
   ENQUEUE_REQUEST(GetRemoteAddress, false);
 }
 
+void GrpcVerbsService::ReqDriverMessageHandler(
+    WorkerCall<DriverMessageReq, DriverMessageResp>* call) {
+  Status s = ReqDriverMessageSync(&call->request, &call->response);
+  call->SendResponse(ToGrpcStatus(s));
+  ENQUEUE_Driver_REQUEST(DriverMessage, ReqDriverMessage, false);
+}
+
+void GrpcVerbsService::ReqPleSendOrCheckHandler(
+    WorkerCall<PleSendOrCheckReq, PleSendOrCheckResp>* call) {
+  Status s = ReqPleSendOrCheckSync(&call->request, &call->response);
+  call->SendResponse(ToGrpcStatus(s));
+  ENQUEUE_Driver_REQUEST(PleSendOrCheck, ReqPleSendOrCheck, false);
+}
+
+// synchronous method
+Status GrpcVerbsService::ReqDriverMessageSync(const DriverMessageReq* request,
+                                              DriverMessageResp* response) {
+  // analyze the send-driven request
+  const string& remote_host_name = request->host_name();
+  RdmaChannel* channel = rdma_mgr_->FindChannel(remote_host_name);
+  CHECK(channel != nullptr) << "GrpcVerbsService RdmaChannel for:"
+                            << remote_host_name << " is nullptr";
+  RDMA_LOG(1) << "GrpcVerbsService Channel local_name_:"
+              << channel->local_name_;
+  string worker_name = worker_env_->session_mgr->LegacySession()->worker_name;
+
+  CHECK(worker_name == channel->local_name_)
+      << "worker_name != channel->local_name_"
+      << " worker_name:" << worker_name
+      << " channel->local_name_:" << channel->local_name_;
+
+  // LOG(INFO) << "GrpcVerbsService recv: " << remote_host_name;
+  std::shared_ptr<RdmaSendDriverMgr> driver_mgr_ptr =
+      channel->GetRdmaSendDriverMgr();
+  driver_mgr_ptr->RpcUpdateRemoteDriverEntry(request, response);
+  return Status::OK();
+}
+
+Status GrpcVerbsService::ReqPleSendOrCheckSync(const PleSendOrCheckReq* request,
+                                               PleSendOrCheckResp* response) {
+  // analyze the request
+  const string& remote_host_name = request->host_name();
+  RdmaChannel* channel = rdma_mgr_->FindChannel(remote_host_name);
+  CHECK(channel != nullptr) << "ReqPleSendOrCheckSync RdmaChannel for:"
+                            << remote_host_name << " is nullptr";
+  LOG(INFO) << "ReqPleSendOrCheckSync Channel local_name_:"
+            << channel->local_name_;
+  string worker_name = worker_env_->session_mgr->LegacySession()->worker_name;
+
+  CHECK(worker_name == channel->local_name_)
+      << "worker_name != channel->local_name_"
+      << " worker_name:" << worker_name
+      << " channel->local_name_:" << channel->local_name_;
+
+  if (channel->could_send_driver_) {
+    LOG(INFO) << "ReqPleSendOrCheckSync for remote:"
+              << remote_host_name
+              << " is ok";
+    response->set_host_name(channel->local_name_);
+    response->set_is_ok(true);
+    return Status::OK();
+  }
+
+  // service allocates static memory and notifies the endpoint
+  // TODO(wuyongyu02): change to large MR
+  channel->InitAndSetDriverStatus();
+  response->set_host_name(channel->local_name_);
+  response->set_is_ok(true);
+  // LOG(INFO) << "ReqPleSendOrCheckSync recv: " << remote_host_name;
+  return Status::OK();
+}
+
 // synchronous method
 Status GrpcVerbsService::GetRemoteAddressSync(
     const GetRemoteAddressRequest* request,
@@ -116,7 +212,13 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   rc->SetRemoteAddress(ra, false);
   rc->Connect();
   int i = 0;
-  int idx[] = {1, 0};
+  // int idx[] = {1, 0};
+  int idx[RdmaChannel::kNumMessageBuffers + 1];
+  for (auto k = 0; k < RdmaChannel::kNumMessageBuffers; k = k + 2) {
+    idx[k] = k + 1;
+    idx[k + 1] = k;
+    // LOG(ERROR) << "idx[" << k << "]:" << idx[k] << " idx[" << k+1 << "]:" << idx[k+1];
+  }
   std::vector<RdmaMessageBuffer*> mb(rc->message_buffers());
   CHECK_EQ(request->mr_size(), RdmaChannel::kNumMessageBuffers);
   for (const auto& mr : request->mr()) {
@@ -136,7 +238,7 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   // setting up response
   response->set_host_name(
       worker_env_->session_mgr->LegacySession()->worker_name);
-  Channel* channel_info = response->mutable_channel();
+  ChannelInfo* channel_info = response->mutable_channel();
   channel_info->set_lid(rc->self().lid);
   channel_info->set_qpn(rc->self().qpn);
   channel_info->set_psn(rc->self().psn);
@@ -157,3 +259,5 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 }
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
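Illustration only (not part of the patch): the loop in GetRemoteAddressSync above generalizes the old idx[] = {1, 0} to any kNumMessageBuffers. The remote MR list arrives ordered tx0, rx0, tx1, rx1, ..., and each local buffer must bind to the opposite role on the peer, so adjacent indices are swapped pairwise. A standalone check of the mapping, with kNumMessageBuffers assumed to be 4:

#include <cstdio>

int main() {
  const int kNumMessageBuffers = 4;  // assumed value for the example
  int idx[kNumMessageBuffers];
  for (int k = 0; k < kNumMessageBuffers; k += 2) {
    idx[k] = k + 1;  // local tx_k pairs with the remote rx_k
    idx[k + 1] = k;  // local rx_k pairs with the remote tx_k
  }
  for (int k = 0; k < kNumMessageBuffers; ++k)
    std::printf("local buffer %d <- remote mr %d\n", k, idx[k]);
  return 0;  // prints 0<-1, 1<-0, 2<-3, 3<-2
}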
diff --git a/tensorflow_networking/verbs/grpc_verbs_service.h b/tensorflow_networking/verbs/grpc_verbs_service.h
index 0d36859..494798b 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service.h
+++ b/tensorflow_networking/verbs/grpc_verbs_service.h
@@ -16,15 +16,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 #define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "grpcpp/alarm.h"
 #include "grpcpp/grpcpp.h"
 #include "grpcpp/server_builder.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
+#include "tensorflow_networking/verbs/rdma.h"
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
 #include "tensorflow/core/lib/core/refcount.h"
-#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 
 namespace tensorflow {
 
@@ -44,11 +47,23 @@ class GrpcVerbsService : public AsyncServiceInterface {
       WorkerCall<GetRemoteAddressRequest, GetRemoteAddressResponse>* call);
   Status GetRemoteAddressSync(const GetRemoteAddressRequest* request,
                               GetRemoteAddressResponse* response);
+
+  void ReqDriverMessageHandler(
+      WorkerCall<DriverMessageReq, DriverMessageResp>* call);
+
+  void ReqPleSendOrCheckHandler(
+      WorkerCall<PleSendOrCheckReq, PleSendOrCheckResp>* call);
+
+  Status ReqDriverMessageSync(const DriverMessageReq* request,
+                              DriverMessageResp* response);
+
+  Status ReqPleSendOrCheckSync(const PleSendOrCheckReq* request,
+                               PleSendOrCheckResp* response);
 
   ::grpc::ServerCompletionQueue* cq_;
   grpc::VerbsService::AsyncService verbs_service_;
   mutex shutdown_mu_;
-  bool is_shutdown_ TF_GUARDED_BY(shutdown_mu_);
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
   ::grpc::Alarm* shutdown_alarm_;
   // not owned
   RdmaMgr* rdma_mgr_;
@@ -63,4 +78,5 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
diff --git a/tensorflow_networking/verbs/grpc_verbs_service_impl.cc b/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
index 506bcb4..3c8d8bd 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
@@ -30,6 +30,8 @@ namespace grpc {
 
 static const char* grpcVerbsService_method_names[] = {
     "/tensorflow.VerbsService/GetRemoteAddress",
+    "/tensorflow.VerbsService/ReqDriverMessage",
+    "/tensorflow.VerbsService/ReqPleSendOrCheck"
 };
 
 std::unique_ptr<VerbsService::Stub> VerbsService::NewStub(
@@ -44,7 +46,14 @@ VerbsService::Stub::Stub(
     : channel_(channel),
       rpcmethod_GetRemoteAddress_(grpcVerbsService_method_names[0],
                                   ::grpc::internal::RpcMethod::NORMAL_RPC,
-                                  channel) {}
+                                  channel),
+      rpcmethod_ReqDriverMessage_(grpcVerbsService_method_names[1],
+                                  ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                  channel),
+      rpcmethod_ReqPleSendOrCheck_(grpcVerbsService_method_names[2],
+                                   ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                   channel)
+      {}
 
 ::grpc::Status VerbsService::Stub::GetRemoteAddress(
     ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
@@ -53,8 +62,24 @@ ::grpc::Status VerbsService::Stub::GetRemoteAddress(
       channel_.get(), rpcmethod_GetRemoteAddress_, context, request, response);
 }
 
+::grpc::Status VerbsService::Stub::ReqDriverMessage(
+    ::grpc::ClientContext* context, const DriverMessageReq& request,
+    DriverMessageResp* response) {
+  // LOG(INFO) << "Stub ReqDriverMessage..."
+  //           << " rpcmethod_ReqDriverMessage_:" << rpcmethod_ReqDriverMessage_;
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReqDriverMessage_, context, request, response);
+}
+
+::grpc::Status VerbsService::Stub::ReqPleSendOrCheck(
+    ::grpc::ClientContext* context, const PleSendOrCheckReq& request,
+    PleSendOrCheckResp* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReqPleSendOrCheck_, context, request, response);
+}
+
 VerbsService::AsyncService::AsyncService() {
-  for (int i = 0; i < 1; ++i) {
+  for (int i = 0; i < 3; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcVerbsService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow_networking/verbs/grpc_verbs_service_impl.h b/tensorflow_networking/verbs/grpc_verbs_service_impl.h
index cdd8904..caabc85 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow_networking/verbs/grpc_verbs_service_impl.h
@@ -32,7 +32,7 @@ namespace tensorflow {
 namespace grpc {
 
 // Implementation of `tensorflow.VerbsService`, based on the
-// definition in "//tensorflow_networking/verbs/verbs_service.proto",
+// definition in "//tensorflow/contrib/verbs/verbs_service.proto",
 // and the gRPC generated stub and service classes.
 // See the proto file for the definition of methods and messages.
 class VerbsService GRPC_FINAL {
@@ -43,6 +43,13 @@ class VerbsService GRPC_FINAL {
     virtual ::grpc::Status GetRemoteAddress(
         ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
         GetRemoteAddressResponse* response) = 0;
+    virtual ::grpc::Status ReqDriverMessage(
+        ::grpc::ClientContext* context, const DriverMessageReq& request,
+        DriverMessageResp* response) = 0;
+
+    virtual ::grpc::Status ReqPleSendOrCheck(
+        ::grpc::ClientContext* context, const PleSendOrCheckReq& request,
+        PleSendOrCheckResp* response) = 0;
   };
   class Stub GRPC_FINAL : public StubInterface {
    public:
@@ -51,9 +58,21 @@ class VerbsService GRPC_FINAL {
         ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
         GetRemoteAddressResponse* response) GRPC_OVERRIDE;
 
+    ::grpc::Status ReqDriverMessage(
+        ::grpc::ClientContext* context,
+        const DriverMessageReq& request,
+        DriverMessageResp* response) GRPC_OVERRIDE;
+
+    ::grpc::Status ReqPleSendOrCheck(
+        ::grpc::ClientContext* context,
+        const PleSendOrCheckReq& request,
+        PleSendOrCheckResp* response) GRPC_OVERRIDE;
+
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
     const ::grpc::internal::RpcMethod rpcmethod_GetRemoteAddress_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReqDriverMessage_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReqPleSendOrCheck_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -71,6 +90,25 @@ class VerbsService GRPC_FINAL {
       ::grpc::Service::RequestAsyncUnary(0, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+
+    void RequestReqDriverMessage(
+        ::grpc::ServerContext* context, DriverMessageReq* request,
+        ::grpc::ServerAsyncResponseWriter<DriverMessageResp>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(1, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+
+    void RequestReqPleSendOrCheck(
+        ::grpc::ServerContext* context, PleSendOrCheckReq* request,
+        ::grpc::ServerAsyncResponseWriter<PleSendOrCheckResp>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(2, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+
   };
 };
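Aside (sketch, not in the patch): the wire ordinal of each RPC must stay consistent across three places — the grpcVerbsService_method_names array, the Stub's RpcMethod constructors, and the index passed to RequestAsyncUnary() (plus the AddMethod() loop bound). One way to keep them from drifting is a shared enum; the names below are hypothetical:

enum VerbsMethodIndex {
  kGetRemoteAddress = 0,   // grpcVerbsService_method_names[0]
  kReqDriverMessage = 1,   // grpcVerbsService_method_names[1]
  kReqPleSendOrCheck = 2,  // grpcVerbsService_method_names[2]
  kNumVerbsMethods         // == 3, the AsyncService constructor bound
};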
diff --git a/tensorflow_networking/verbs/rdma.cc b/tensorflow_networking/verbs/rdma.cc
index b4b3dad..971dd32 100644
--- a/tensorflow_networking/verbs/rdma.cc
+++ b/tensorflow_networking/verbs/rdma.cc
@@ -13,14 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include <fcntl.h>
 #include <cstdlib>
+#include <functional>
+#include <memory>
+#include <regex>
+#include <set>
+#include <sstream>
+#include <string>
 
+#include "tensorflow_networking/verbs/rdma.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
-#include "tensorflow_networking/verbs/rdma.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
@@ -54,6 +62,12 @@ string MessageTypeToString(RdmaMessageType rmt) {
     case RDMA_MESSAGE_TENSOR_REQUEST:
       return "RDMA_MESSAGE_TENSOR_REQUEST";
       break;
+    case RDMA_MESSAGE_DRIVER_BEGIN:
+      return "RDMA_MESSAGE_DRIVER_BEGIN";
+      break;
+    case RDMA_MESSAGE_ERROR_STATUS:
+      return "RDMA_MESSAGE_ERROR_STATUS";
+      break;
     default:
       return "UNKNOWN MESSAGE";
   }
@@ -79,6 +93,8 @@ string get_env_var(char const* var_name) {
 
 ibv_context* open_device(ibv_device* ibv_dev) {
   ibv_context* context = ibv_open_device(ibv_dev);
+  LOG(INFO) << "RDMA context->num_comp_vectors:" << context->num_comp_vectors;
+
   CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
   return context;
 }
@@ -98,6 +114,13 @@ int get_dev_active_port_count(ibv_device* device) {
   CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
   rc = ibv_query_device(context, &device_att);
   CHECK(!rc) << "Failed to query the device";
+  LOG(INFO) << "[RDMA Device Info] "
+            << " max_qp:" << device_att.max_qp
+            << " max_cq:" << device_att.max_cq
+            << " max_pd:" << device_att.max_pd
+            << " max_mr:" << device_att.max_mr
+            << " max_mr_size:" << device_att.max_mr_size;
+
 
   for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
     rc = ibv_query_port(context, port_index, &port_attr);
@@ -398,128 +421,348 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
   return pd;
 }
 
+Chunk::Chunk(struct ibv_pd* pd) :
+    pd_(pd), allocate_size_(0), curr_size_(0), empty_size_(0),
+    offset_(0), total_waste_size_(0), total_realloc_size_(0) {
+  chunk_addr_size = VerbsEnvRegistrar::Instance()->RdmaChunkSize();
+  if (EIGEN_MAX_ALIGN_BYTES > 0) {
+    int ratio = (chunk_addr_size + EIGEN_MAX_ALIGN_BYTES) / EIGEN_MAX_ALIGN_BYTES;
+    chunk_addr_size = ratio * EIGEN_MAX_ALIGN_BYTES;
+  }
+  LOG(INFO) << "chunk size:"
+            << chunk_addr_size
+            << " EIGEN_MAX_ALIGN_BYTES:"
+            << EIGEN_MAX_ALIGN_BYTES;
+}
+
+void Chunk::FreeChunk() {
+  LOG(INFO) << "delete Chunk";
+  for (auto& it : mrs_) {
+    ibv_dereg_mr(it);
+  }
+  for (auto& it : chunk_addrs_) {
+    free(it);
+  }
+}
+
+Chunk::~Chunk() { }
+
+void Chunk::Alloc(size_t size, void** p, ibv_mr** mr, size_t realloc_size) {
+  mutex_lock l(alloc_mu_);
+  size_t align_size = size;
+  if (EIGEN_MAX_ALIGN_BYTES > 0) {
+    int ratio = (size + EIGEN_MAX_ALIGN_BYTES - 1) / EIGEN_MAX_ALIGN_BYTES;
+    align_size = ratio * EIGEN_MAX_ALIGN_BYTES;
+  }
+  // not enough empty space left; allocate a new chunk
+  if (empty_size_ < align_size) {
+    size_t malloc_size =
+        (align_size + chunk_addr_size - 1) / chunk_addr_size * chunk_addr_size;
+    curr_size_ += malloc_size;
+    total_waste_size_ += empty_size_;
+    total_realloc_size_ += realloc_size;
+    LOG(INFO) << "RDMA Allocate Memory: " << curr_size_ << " Bytes "
+              << total_waste_size_ << " " << total_realloc_size_;
+    offset_ = 0;
+    empty_size_ = malloc_size;
+    size_t allocate_size = 0;
+    ib_malloc((void**)&new_p_, &allocate_size, malloc_size,
+              EIGEN_MAX_ALIGN_BYTES);
+    new_mr_ = ibv_reg_mr(pd_, new_p_, malloc_size,
+                         IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+    mrs_.emplace_back(new_mr_);
+    chunk_addrs_.emplace_back(new_p_);
+  }
+  *p = (void*)(((char*)new_p_) + offset_);
+  empty_size_ -= align_size;
+  *mr = new_mr_;
+  offset_ += align_size;
+}
+
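Illustration only (standalone arithmetic check with assumed constants): Chunk::Alloc rounds each request up to EIGEN_MAX_ALIGN_BYTES and, when the current chunk cannot hold it, grows the arena by whole multiples of chunk_addr_size:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t kAlign = 64;       // stand-in for EIGEN_MAX_ALIGN_BYTES
  const size_t kChunk = 1 << 20;  // stand-in for chunk_addr_size (1 MiB)
  size_t size = 1000;
  // Same round-up expressions as Chunk::Alloc above.
  size_t align_size = (size + kAlign - 1) / kAlign * kAlign;         // 1024
  size_t malloc_size = (align_size + kChunk - 1) / kChunk * kChunk;  // 1 MiB
  std::printf("align_size=%zu malloc_size=%zu\n", align_size, malloc_size);
  return 0;
}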
 RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
     : context_(open_device(set_device())),
       params_(params_init(context_)),
       pd_(alloc_protection_domain(context_)),
       worker_env_(worker_env) {
-  event_channel_ = ibv_create_comp_channel(context_);
-  CHECK(event_channel_) << "Failed to create completion channel";
-  cq_ = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL, event_channel_,
-                      0);
-  CHECK(cq_) << "Failed to create completion queue";
-  CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
+  recv_chunk_ = new Chunk(pd_);
+  cq_nums_ = VerbsEnvRegistrar::Instance()->RdmaCqNums();
+  wc_vec_ = new ibv_wc*[cq_nums_];
+  cq_vec_ = new ibv_cq*[cq_nums_];
+  event_channel_vec_ = new ibv_comp_channel*[cq_nums_];
+  for (int i = 0; i < cq_nums_; i++) {
+    wc_vec_[i] = new ibv_wc[MAX_CONCURRENT_WRITES * 2];
+    event_channel_vec_[i] = ibv_create_comp_channel(context_);
+    CHECK(event_channel_vec_[i]) << "Failed to create of " << i
+                                 << " completion channel";
+    cq_vec_[i] = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL,
+                               event_channel_vec_[i], 0);
+    CHECK(cq_vec_[i]) << "Failed to create of " << i << " completion queue";
+    CHECK(!ibv_req_notify_cq(cq_vec_[i], 0))
+        << "Failed to request CQ notification";
+  }
+  LOG(INFO) << "RdmaCQpoolSize:"
+            << VerbsEnvRegistrar::Instance()->RdmaCQpoolSize();
+  pool_ = new thread::ThreadPool(Env::Default(), ThreadOptions(),
+      "process_wr_impl", VerbsEnvRegistrar::Instance()->RdmaCQpoolSize(),
+      false, nullptr);
 }
 
 RdmaAdapter::~RdmaAdapter() {
-  polling_thread_.reset();
-  CHECK(!ibv_destroy_cq(cq_)) << "Failed to destroy CQ";
-  CHECK(!ibv_destroy_comp_channel(event_channel_))
-      << "Failed to destroy channel";
+  for (int i = 0; i < cq_nums_; i++) {
+    polling_thread_vec_[i].reset();
+  }
+  for (int i = 0; i < cq_nums_; i++) {
+    CHECK(!ibv_destroy_cq(cq_vec_[i])) << "Failed to destroy CQ";
+    CHECK(!ibv_destroy_comp_channel(event_channel_vec_[i]))
+        << "Failed to destroy channel";
+  }
   CHECK(!ibv_dealloc_pd(pd_)) << "Failed to deallocate PD";
   CHECK(!ibv_close_device(context_)) << "Failed to release context";
+  recv_chunk_->FreeChunk();
+  delete recv_chunk_;
+  recv_chunk_ = nullptr;
 }
 
 void RdmaAdapter::StartPolling() {
-  polling_thread_.reset(Env::Default()->StartThread(
-      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
+  for (int i = 0; i < cq_nums_; i++) {
+    polling_thread_vec_.emplace_back(Env::Default()->StartThread(
+        ThreadOptions(), "RdmaAdapterCQThread",
+        [this, i] { Pool_Process_CQ(i); }));
+  }
   VLOG(2) << "Start RdmaAdapter: " << name();
 }
 
 string RdmaAdapter::name() const { return string(context_->device->name); }
 
-// Function to process incoming messages
-// There are two types of messages:
-// 1. IBV_WC_RECV_RDMA_WITH_IMM (receive)
-// 2. IBV_WC_RDMA_WRITE (send))
-void RdmaAdapter::Process_CQ() {
-  while (true) {
-    ibv_cq* cq;
-    void* cq_context;
-    CHECK(!ibv_get_cq_event(event_channel_, &cq, &cq_context));
-    CHECK(cq == cq_);
-    ibv_ack_cq_events(cq, 1);
-    CHECK(!ibv_req_notify_cq(cq_, 0));
+void RdmaAdapter::Process_WR(ibv_wc wc_, int cq_num) {
+  if (wc_.status != IBV_WC_SUCCESS) {
+    return;
+  }
+  CHECK(wc_.status == IBV_WC_SUCCESS)
+      << "Failed status \n"
+      << ibv_wc_status_str(wc_.status) << " " << wc_.status << " "
+      << static_cast<uint64_t>(wc_.wr_id) << " " << wc_.vendor_err;
+  if (wc_.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+    RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_.wr_id);
+    if (rc == nullptr) {
+      LOG(FATAL) << "Process_WR failed wc_.wr_id:" << wc_.wr_id
+                 << " cq_num:" << cq_num;
+      return;
+    }
+    // put back a recv wr.
+    rc->Recv();
+    // imm_data is the index of RX buffer in the buffer table.
+    uint32_t imm_data = wc_.imm_data;
+    RdmaMessageBuffer* rb;
+    RdmaMessage rm;
+
+    if (imm_data > RDMA_IMM_MAX_REQUEST_ID && imm_data <= RDMA_IMM_DATA_ACK) {
+      // receive an ack to a message
+      int pair_index = imm_data - RDMA_IMM_MAX_REQUEST_ID - 1;
+      int buffer_index = 2 * pair_index;
+      rb = rc->message_buffers()[buffer_index];
+      rb->SetBufferStatus(remote, idle);
+      rb->SendNextItem();
+      return;
+    }
 
-    int ne =
-        ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2, static_cast<ibv_wc*>(wc_));
-    CHECK_GE(ne, 0);
-    for (int i = 0; i < ne; ++i) {
-      CHECK(wc_[i].status == IBV_WC_SUCCESS)
-          << "Failed status \n"
-          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
-          << static_cast<uint64_t>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
-      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-        RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
-        // put back a recv wr.
-        rc->Recv();
-        // imm_data is the index of RX buffer in the buffer table.
-        uint32_t imm_data = wc_[i].imm_data;
-        RdmaMessageBuffer* rb;
-        RdmaMessage rm;
-
-        if (imm_data == RDMA_IMM_DATA_ACK) {
-          // receive an ack to a message
-          rb = rc->tx_message_buffer_;
-          rb->SetBufferStatus(remote, idle);
-          rb->SendNextItem();
-          continue;
+    if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
+      // receive a tensor RDMA write
+      uint32_t request_index = imm_data;
+      if (imm_data < RDMA_IMM_MIN_SENDMGR_BASE) {
+        RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
+        if (request == nullptr) {
+          LOG(INFO) << "Normal request_index:"
+                    << request_index
+                    << " , Normal request is done by SendDriverMgr";
+          return;
+        }
+        RDMA_LOG(1) << "DoNormal request_index:" << request_index;
+        request->RecvTensorContent();
+      } else {
+        // RecvSendDriver
+        const auto& tensors_uid_parsed_key =
+            rc->channel_record_->GetChannelTensorsUidParsedkey();
+        const auto& it = tensors_uid_parsed_key.find(imm_data);
+        if (it == tensors_uid_parsed_key.end()) {
+          LOG(FATAL) << "RdmaTensorRequest parsed_key not found for imm_data:"
+                     << imm_data;
+        }
+        const auto& parsed_key = it->second;
+        bool has_data = false;
+        std::shared_ptr<DriverEntry> entry_ptr =
+            rc->rdma_send_driver_mgr_->GetDriverEntry(parsed_key, &has_data);
+
+        const auto& tensors_meta_data =
+            rc->channel_record_->GetChannelTensorsMetaData();
+        const auto& meta = tensors_meta_data.find(parsed_key);
+        if (meta == tensors_meta_data.end()) {
+          LOG(FATAL)
+              << "meta is not found in rc->channel_record_->tensors_meta_data_";
         }
-
-        if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
-          // receive a tensor RDMA write
-          uint32_t request_index = imm_data;
-          RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
-          request->RecvTensorContent();
-          continue;
+        bool can_memcpy = DataTypeCanUseMemcpy(meta->second.data_type_);
+        if (!has_data) {
+          // parse the DriverPrefixMessage
+          DriverPrefixMessage driver_prefix =
+              DriverPrefixMessage::ParseDriverPrefixMessage(
+                  (void*)entry_ptr->addr_, meta->second.meta_changed_);
+          Tensor* val;
+          void* entry_tensor_addr = nullptr;
+          // get the RDMA offset addr of the Tensor
+          if (meta->second.meta_changed_) {
+            entry_tensor_addr = (void*)(entry_ptr->addr_ +
+                DriverPrefixMessage::kPrefixMessageTotalBytes);
+          } else {
+            entry_tensor_addr = (void*)(entry_ptr->addr_ +
+                DriverPrefixMessage::CkPrefixMessageTotalBytes);
+          }
+          if (can_memcpy) {
+            // tensor can use zero-copy
+            auto fake_allocator = new FakeAllocator(entry_tensor_addr);
+            if (meta->second.meta_changed_) {
+              val = new Tensor(fake_allocator,
+                               meta->second.data_type_,
+                               driver_prefix.tensor_shape_);
+            } else {
+              val = new Tensor(fake_allocator,
+                               meta->second.data_type_,
+                               meta->second.tensor_shape_);
+            }
+            // memcpy(DMAHelper::base(val), entry_tensor_addr, val->TotalBytes());
+          } else {
+            // proto should not use zero-copy
+            if (meta->second.meta_changed_) {
+              val = new Tensor(meta->second.data_type_,
+                               driver_prefix.tensor_shape_);
+            } else {
+              val = new Tensor(meta->second.data_type_,
+                               meta->second.tensor_shape_);
+            }
+            TensorProto proto;
+            CHECK(ParseProtoUnlimited(&proto, entry_tensor_addr,
+                                      driver_prefix.tensor_bytes_))
+                << " fail to parse proto from array";
+            if (proto.dtype() > 0 && proto.dtype() <= DataType_MAX) {
+              Tensor parsed(proto.dtype());
+              if (parsed.FromProto(cpu_allocator(), proto)) {
+                *val = std::move(parsed);
+              }
+            }
+          }
+          Status s = Status::OK();
+          bool is_dead = driver_prefix.is_dead_;
+          int64 recv_micros = 0;
+          Rendezvous::Args send_args = Rendezvous::Args();
+          rc->local_driver_buffer_mgr_->QueueRdmaSave(parsed_key,
+              send_args, val, is_dead, recv_micros);
+          // if (val != nullptr) {
+          //   delete val;
+          //   val = nullptr;
+          // }
+        } else {
+          // When recv a SendDriverData which means that :
+          // local recv SendDriver is ready.
+          LOG(FATAL) << "Local recv SendDriver Data is not ready"
+                     << " has_data:" << has_data;
        }
+      }
+      return;
+    }
 
-        // receive a control message
-        rb = rc->rx_message_buffer_;
-        RdmaMessage::ParseMessage(rm, rb->buffer_);
-        RdmaMessageBuffer::SendAck(rc);
-        RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
-                    << ": Received " << MessageTypeToString(rm.type_) << " "
-                    << "#" << rm.request_index_ << ": " << rm.name_;
-
-        if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
-          RdmaTensorResponse* response = rc->AddTensorResponse(rm);
-          response->Start();
-        } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
-          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
-          request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
+    // receive a control message
+    int pair_index = imm_data - RDMA_IMM_DATA_ACK - 1;
+    int buffer_index = 2 * pair_index + 1;
+    rb = rc->message_buffers()[buffer_index];
+    RdmaMessage::ParseMessage(rm, rb->buffer_);
+    RdmaMessageBuffer::SendAck(rc, pair_index + 1);
+    RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+                << ": Received " << MessageTypeToString(rm.type_) << " "
+                << "#" << rm.request_index_ << ": " << rm.name_;
+    RDMA_LOG(1) << "pair_index imm_data:" << imm_data
+                << " Process_WR rm type:" << MessageTypeToString(rm.type_);
+
+    if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
+      RdmaTensorResponse* response = rc->AddTensorResponse(rm);
+      RDMA_LOG(1) << "GetResponse....";
+      response->Start();
+    } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
+      RDMA_LOG(1) << "Receive RDMA_MESSAGE_META_DATA_UPDATE";
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      if (request == nullptr) {
+        LOG(FATAL) << "RDMA_MESSAGE_META_DATA_UPDATE request : "
+                   << rm.request_index_ << " is already done by LocalBufferMgr.";
+      }
+      request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
                                   rm.is_dead_, rm.tensor_bytes_);
 #ifdef RDMA_DATA_VALIDATION
-          request->RecvTensorChecksum(rm.checksum_);
+      request->RecvTensorChecksum(rm.checksum_);
 #endif
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
-          RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
-          response->Resume();
-        } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
-          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
-          request->RecvErrorStatus(rm.status_);
-        }
-      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
-        RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_[i].wr_id);
-        RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
-        switch (wr_id->write_type) {
-          case RDMA_WRITE_ID_ACK:
-            break;
-          case RDMA_WRITE_ID_MESSAGE: {
-            RdmaMessageBuffer* rb =
-                reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
-            rb->SetBufferStatus(local, idle);
-            rb->SendNextItem();
-            break;
-          }
-          case RDMA_WRITE_ID_TENSOR_WRITE: {
-            RdmaTensorResponse* response =
-                reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
-            response->Destroy();
-          }
-        }
-        delete wr_id;
+    } else if (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) {
+      LOG(INFO) << "Receive RDMA_MESSAGE_DRIVER_BEGIN";
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      if (request == nullptr) {
+        LOG(INFO) << "RDMA_MESSAGE_DRIVER_BEGIN request : "
+                  << rm.request_index_ << " is already done by LocalBufferMgr.";
+      }
+    } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
+      RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
+      response->Resume();
+    } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      request->RecvErrorStatus(rm.status_);
+    }
+  } else if (wc_.opcode == IBV_WC_RDMA_WRITE) {
+    RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_.wr_id);
+    RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
+    switch (wr_id->write_type) {
+      case RDMA_WRITE_ID_ACK:
+        break;
+      case RDMA_WRITE_ID_MESSAGE: {
+        RdmaMessageBuffer* rb =
+            reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
+        // TODO(wuyongyu02): (local buffer idle)
+        rb->SetBufferStatus(local, idle);
+        rb->SendNextItem();
+        break;
+      }
+      case RDMA_WRITE_ID_SEND_DEIVER_WRITE: {
+        DriverEntry* entry =
+            reinterpret_cast<DriverEntry*>(wr_id->write_context);
+        RDMA_LOG(1) << "send succeeded, FreeEntry uid:" << entry->uinque_id_;
+        break;
+      }
+      case RDMA_WRITE_ID_TENSOR_WRITE: {
+        RdmaTensorResponse* response =
+            reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
+        response->Destroy();
+      }
+    }
+    if (wr_id->write_type != RDMA_WRITE_ID_SEND_DEIVER_WRITE) {
+      delete wr_id;
+    }
+  }
+}
+
+void RdmaAdapter::Pool_Process_CQ(int cq_num) {
+  LOG(INFO) << "Pool_Process_CQ:" << cq_num;
+  auto cq = cq_vec_[cq_num];
+  auto event_channel = event_channel_vec_[cq_num];
+  auto wc = wc_vec_[cq_num];
+  while (true) {
+    ibv_cq* cq_tmp;
+    void* cq_context;
+    CHECK(!ibv_get_cq_event(event_channel, &cq_tmp, &cq_context));
+    CHECK(cq_tmp == cq);
+    ibv_ack_cq_events(cq_tmp, 1);
+    CHECK(!ibv_req_notify_cq(cq, 0));
+
+    int ne =
+        ibv_poll_cq(cq, MAX_CONCURRENT_WRITES * 2, static_cast<ibv_wc*>(wc));
+    CHECK_GE(ne, 0);
+
+    for (int i = 0; i < ne; ++i) {
+      auto c = std::bind(&RdmaAdapter::Process_WR, this, wc[i], cq_num);
+      pool_->Schedule(std::move(c));
+      // worker_env_->compute_pool->Schedule(std::move(c));
     }
   }
 }
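Aside (condensed sketch, not in the patch): Process_WR demultiplexes wc_.imm_data into four ranges. The constants are the ones the code above relies on (defined elsewhere in rdma.h, not shown here), so they are passed in as parameters:

#include <cstdint>

enum ImmClass { kTensorRequest, kSendMgrTensor, kAck, kControlMessage };

ImmClass ClassifyImm(uint32_t imm_data,
                     uint32_t max_request_id,    // RDMA_IMM_MAX_REQUEST_ID
                     uint32_t min_sendmgr_base,  // RDMA_IMM_MIN_SENDMGR_BASE
                     uint32_t data_ack) {        // RDMA_IMM_DATA_ACK
  if (imm_data <= max_request_id)  // tensor content written by the peer
    return imm_data < min_sendmgr_base ? kTensorRequest : kSendMgrTensor;
  if (imm_data <= data_ack)        // ack: pair_index = imm - max_request_id - 1
    return kAck;
  return kControlMessage;          // control: pair_index = imm - data_ack - 1
}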
IBV_QPT_UC; qp_ = ibv_create_qp(adapter_->pd_, &attr); CHECK(qp_) << "Failed to create queue pair"; @@ -589,6 +851,7 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name, attr.port_num = adapter_->params_.port_num; attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE; + // https://man7.org/linux/man-pages/man3/ibv_modify_qp.3.html int mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; CHECK(!ibv_modify_qp(qp_, &attr, mask)) << "Failed to set QP to INIT"; @@ -614,24 +877,50 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name, // create message and ack buffers, then initialize the tables. { const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer"}; - tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]); - rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]); message_buffers_.reserve(kNumMessageBuffers); - message_buffers_.push_back(tx_message_buffer_); - message_buffers_.push_back(rx_message_buffer_); - // create buffer on host - tx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); - rx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + + // add other buffers + for (int i = 0; i < kNumMessageBuffers; i = i + 2) { + int pair_index = i/2+1; + std::stringstream ss; + ss << pair_index; + auto* tx_buffer1 = new RdmaMessageBuffer(this, + "tx_message_buffer_" + ss.str()); + tx_buffer1->pair_index_ = pair_index; + auto* rx_buffer2 = new RdmaMessageBuffer(this, + "rx_message_buffer_" + ss.str()); + rx_buffer2->pair_index_ = pair_index; + message_buffers_.push_back(tx_buffer1); + message_buffers_.push_back(rx_buffer2); + // create buffer and bind to MR + // tx_buffer1->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + // rx_buffer2->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + // NOTE(wuyongyu02): use chunk to alloc MR + void* p1; + void* p2; + ibv_mr* mr1; + ibv_mr* mr2; + adapter_->recv_chunk_->Alloc(ib_allocate_size(RdmaMessage::kRdmaMessageBufferSize * 2), &p1, &mr1); + CHECK(p1 != nullptr) << " p1 is nullptr"; + tx_buffer1->ChunkCreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize, p1, mr1); + adapter_->recv_chunk_->Alloc(ib_allocate_size(RdmaMessage::kRdmaMessageBufferSize * 2), &p2, &mr2); + CHECK(p1 != nullptr) << " p2 is nullptr"; + rx_buffer2->ChunkCreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize, p2, mr2); + } } CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_ << " with error " << std::strerror(errno); + + channel_record_ = std::make_shared(this); + rdma_send_driver_mgr_ = std::make_shared(this); + local_driver_buffer_mgr_ = std::make_shared(this); } RdmaChannel::~RdmaChannel() { ibv_dereg_mr(mr_); CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP"; - delete tx_message_buffer_; - delete rx_message_buffer_; + // delete tx_message_buffer_; + // delete rx_message_buffer_; } void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) { @@ -657,7 +946,7 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) { void RdmaChannel::Recv() { struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t) this; + wr.wr_id = (uint64_t)this; struct ibv_recv_wr* bad_wr; CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv"; } @@ -668,9 +957,12 @@ RdmaTensorRequest* RdmaChannel::InsertTensorRequest( const RdmaTensorRequest::RecvDoneCallback& done) { mutex_lock lock{ct_mu_}; uint32_t request_index = request_serial_++; - if 
(request_serial_ > RDMA_IMM_MAX_REQUEST_ID) { + + // > RDMA_IMM_MIN_SENDMGR_BASE for SendMgr + if (request_serial_ >= RDMA_IMM_MIN_SENDMGR_BASE) { request_serial_ = 0; } + RdmaTensorRequest request(request_index, key, step_id, this, dst_dev, recv_args, done); auto it = request_table_.emplace(request_index, request); @@ -679,16 +971,34 @@ RdmaTensorRequest* RdmaChannel::InsertTensorRequest( void RdmaChannel::RemoveTensorRequest(uint32_t request_index) { mutex_lock lock{ct_mu_}; - request_table_.erase(request_index); + RDMA_LOG(1) << "RemoveTensorRequest:" << request_index; + //<< " parsed_key:" << key_; + const auto& it = request_table_.find(request_index); + if (it != request_table_.end()) { + request_table_.erase(request_index); + } } RdmaTensorRequest* RdmaChannel::GetTensorRequest(uint32_t request_index) { mutex_lock lock{ct_mu_}; RequestTable::iterator iter = request_table_.find(request_index); - CHECK(iter != request_table_.end()); + // CHECK(iter != request_table_.end()) + // << " RdmaChannel is already been delete."; + if (iter == request_table_.end()) { + return nullptr; + } return &iter->second; } +RdmaTensorRequest* RdmaChannel::GetTensorRequestForCat(uint32_t request_index) { + mutex_lock lock{ct_mu_}; + RequestTable::iterator iter = request_table_.find(request_index); + if (iter != request_table_.end()) { + return &iter->second; + } + return nullptr; +} + void RdmaChannel::Connect() { { mutex_lock lock{mu_}; @@ -728,11 +1038,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class; int r; - CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER))) + CHECK(!(r = ibv_modify_qp(qp_, &attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER))) << "QP to Ready to Receive " << r; memset(&attr, 0, sizeof(ibv_qp_attr)); @@ -743,10 +1053,10 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { attr.rnr_retry = 7; /* infinite */ attr.max_rd_atomic = 1; - CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC))) + CHECK(!(r = ibv_modify_qp(qp_, &attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC))) << "QP to Ready to Send " << r; connected_ = true; @@ -755,6 +1065,711 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { } } +RdmaSendDriverMgr::RdmaSendDriverMgr(RdmaChannel* channel) { + channel_ = channel; + driver_mgr_is_ok_ = false; +} + +size_t RdmaSendDriverMgr::InitLocalDriverEntry() { + // LOG(INFO) << "InitLocalDriverEntry begin..."; + const auto& tensors_meta_data = + channel_->channel_record_->GetChannelTensorsMetaData(); + const auto& global_tensors_meta_data = + RecordTensorMetaData::Singleton().GetGlobalTensorsMetaData(); + // LOG(INFO) << "To Remote name:" << channel_->remote_name_ + // << "Channel_Record_Size:" << tensors_meta_data.size(); + const auto& tensors_uidkeys = + channel_->channel_record_->GetChannelTensorsUidParsedkey(); + + CHECK(tensors_meta_data.size() == tensors_uidkeys.size()) + << "tensors_meta_data size:" << tensors_meta_data.size() + << " tensors_uidkeys size:" << tensors_uidkeys.size(); + + LOG(INFO) << "InitLocalDriverEntry channel Metadata key begin " + << "create dirven-entry:" + << 
tensors_meta_data.size();
+  std::set<string> regrex_edge_keys;
+  for (auto& it : tensors_meta_data) {
+    const auto& meta_data = it.second;
+    const uint32& uid = meta_data.uid_;
+    void* addr;
+    ibv_mr* mr;
+    // allocate memory and region
+    int find_allocate_bytes = 0;
+    // NOTE(wuyongyu02) alloc recv-tensor memory
+    if (!channel_->FindLocalMr(it.first, &addr, &mr, &find_allocate_bytes)) {
+      LOG(INFO) << it.first << " not found..";
+      find_allocate_bytes = 0;
+    } else {
+      LOG(INFO) << it.first << " found.. bytes:" << find_allocate_bytes;
+    }
+    int need_bytes = VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() *
+        ChannelRecordTensorMetaData::GetTensorBytes(meta_data) +
+        DriverPrefixMessage::kPrefixMessageTotalBytes;
+
+    if (find_allocate_bytes < need_bytes) {
+      LOG(INFO) << it.first << " reallocate.. need:"
+                << need_bytes << " found:" << find_allocate_bytes;
+      channel_->channel_record_->AllocateMemoryAndRegion(it.first, meta_data,
+          channel_->adapter_->pd_, &addr, &mr, &find_allocate_bytes);
+    }
+    driver_entries_[it.first] = std::make_shared<DriverEntry>(
+        uid, it.first, addr, mr, find_allocate_bytes);
+    driver_entries_[it.first]->meta_changed_ = meta_data.meta_changed_;
+  }
+
+  LOG(INFO) << "InitLocalDriverEntry channel Metadata key:"
+            << tensors_meta_data.size()
+            << " driver_entries size:"
+            << driver_entries_.size()
+            << " global_tensors_meta_data size:"
+            << global_tensors_meta_data.size();
+  // Notify the remote side of the local driver entries through RPC.
+  NotifyRemoteDriverEntry();
+  return driver_entries_.size();
+}
+
+// server service Update
+void RdmaSendDriverMgr::RpcUpdateDriverEntries(const DriverMessageResp& resp) {
+  CHECK(channel_->remote_name_ == resp.host_name())
+      << "channel_->remote_name_:" << channel_->remote_name_
+      << " resp.host_name:" << resp.host_name();
+  size_t driver_mgr_is_ok = 0;
+  for (const auto& it : resp.item()) {
+    const auto& parsed_key = it.parsed_key();
+    const auto& entry = driver_entries_.find(parsed_key);
+    if (entry == driver_entries_.end()) {
+      LOG(FATAL) << "RDMA parsed key "
+                 << parsed_key
+                 << " is not found in driver_entries_";
+      for (auto& k : driver_entries_) {
+        LOG(INFO) << "driver_entries_ key:" << k.first;
+      }
+    }
+    auto& entry_ptr = driver_entries_[parsed_key];
+    if (it.status() == DriverMessageItem::RPC_0 &&
+        entry_ptr->dri_status_ == RPC_0) {
+      entry_ptr->dri_status_ = RPC_1;
+    } else if (it.status() == DriverMessageItem::RPC_1 &&
+               entry_ptr->dri_status_ == RPC_1) {
+      entry_ptr->dri_status_ = DATA_NOT_READY;
+      driver_mgr_is_ok++;
+    } else {
+      LOG(ERROR) << "RDMA RdmaSendDriverMgr::DriverEntries"
+                 << " local_name:" << channel_->local_name_
+                 << " remote_name:" << channel_->remote_name_
+                 << " key:" << parsed_key
+                 << " entry.dri_status_:" << entry_ptr->dri_status_
+                 << " it.status:" << it.status();
+    }
+  }
+  // When all entries are ok, set the driver_mgr status to 'ok'.
+  if (driver_mgr_is_ok == driver_entries_.size()) {
+    driver_mgr_is_ok_.store(true);
+    // LOG(INFO) << "[Succeed] "
+    //           << channel_->remote_name_
+    //           << " driver_mgr_ptr RpcSend Entries is ok!";
+  }
+}
+
+bool RdmaSendDriverMgr::RpcReqResp(GrpcVerbsClient* client,
+                                   const DriverMessageReq& req) {
+  // synchronous call
+  const auto& remote_name = channel_->remote_name_;
+  DriverMessageResp resp;
+  Status s;
+  int attempts = 0;
+  static const int max_num_attempts = 5;
+  do {
+    s = client->ReqDriverMessage(&req, &resp);
+    // save obtained remote addresses
+    // connect to the remote channel
+    if (s.ok()) {
+      RpcUpdateDriverEntries(resp);
+    } else {
+      LOG(ERROR) << "ReqDriverMessage Connecting to " << remote_name
+                 << ": Got " << s.error_message() << ". Retrying ("
+                 << (attempts + 1) << "/" << max_num_attempts << ")...";
+      if (++attempts == max_num_attempts) {
+        return false;
+      }
+      channel_->adapter_->worker_env_->env->SleepForMicroseconds(2000000);
+    }
+  } while (!s.ok());
+  return true;
+}
+
+// Notify by Rpc
+void RdmaSendDriverMgr::NotifyRemoteDriverEntry() {
+  const auto& remote_name = channel_->remote_name_;
+  const auto& local_name = channel_->local_name_;
+  RDMA_LOG(1) << "NotifyRemoteDriverEntry local_worker_name:" << local_name
+              << " remote_name:" << remote_name
+              << " driver_entries_ size:" << driver_entries_.size();
+
+  // get the channel cache
+  SharedGrpcChannelPtr client_channel =
+      channel_->channel_cache_->FindWorkerChannel(remote_name);
+  CHECK(client_channel != nullptr) << "target:"
+                                   << remote_name
+                                   << " client_channel is null!";
+  GrpcVerbsClient* client = new GrpcVerbsClient(client_channel);
+  CHECK(client != nullptr) << "No worker known as " << remote_name;
+
+  DriverMessageReq req;
+  req.set_host_name(local_name);
+  for (auto& it : driver_entries_) {
+    auto* item = req.add_item();
+    auto driver_entry_ptr = it.second;
+    item->set_unique_id(driver_entry_ptr->uinque_id_);
+    item->set_parsed_key(it.first);
+    item->set_remote_addr(driver_entry_ptr->addr_);
+    item->set_rkey(driver_entry_ptr->lkey_);
+    item->set_allocate_bytes(driver_entry_ptr->allocate_size_);
+    item->set_meta_changed(driver_entry_ptr->meta_changed_);
+    item->set_status(DriverMessageItem::RPC_0);
+    // Remember to update the driver_entries_ status.
+    it.second->dri_status_ = RPC_0;
+  }
+  if (RpcReqResp(client, req)) {
+    DriverMessageReq req_rpc2;
+    req_rpc2.set_host_name(local_name);
+    for (auto& it : driver_entries_) {
+      auto* item = req_rpc2.add_item();
+      auto driver_entry_ptr = it.second;
+      item->set_unique_id(driver_entry_ptr->uinque_id_);
+      item->set_parsed_key(it.first);
+      item->set_remote_addr(driver_entry_ptr->addr_);
+      item->set_rkey(driver_entry_ptr->lkey_);
+      item->set_allocate_bytes(driver_entry_ptr->allocate_size_);
+      item->set_meta_changed(driver_entry_ptr->meta_changed_);
+      item->set_status(DriverMessageItem::RPC_1);
+      // Remember to update the driver_entries_ status.
+      it.second->dri_status_ = RPC_1;
+    }
+    if (!RpcReqResp(client, req_rpc2)) {
+      LOG(ERROR) << "ReqDriverMessage RpcReqResp2 remote node "
+                 << remote_name << " FAILED";
+    }
+  } else {
+    LOG(ERROR) << "ReqDriverMessage RpcReqResp remote node "
+               << remote_name << " FAILED";
+  }
+  RDMA_LOG(0) << "ReqDriverMessage Connected to remote node " << remote_name;
+  delete client;
+}
+
+void RdmaSendDriverMgr::RpcUpdateRemoteDriverEntry(
+    const DriverMessageReq* request, DriverMessageResp* response) {
+  // setting up response
+  response->set_host_name(channel_->local_name_);
+  size_t recv_driver_mgr_entry_ok_nums = 0;
+  for (const auto& req_item : request->item()) {
+    DriverMessageItem* resp_item = response->add_item();
+    string parsed_key = req_item.parsed_key();
+    resp_item->set_parsed_key(parsed_key);
+    const auto& it = recv_entries_.find(parsed_key);
+    DriverMessageItem::DriverStatus status = req_item.status();
+    if (it == recv_entries_.end() && status == DriverMessageItem::RPC_0) {
+      recv_entries_[parsed_key] = std::make_shared<DriverEntry>();
+      recv_entries_[parsed_key]->uinque_id_ = req_item.unique_id();
+      recv_entries_[parsed_key]->addr_ = req_item.remote_addr();
+      recv_entries_[parsed_key]->lkey_ = req_item.rkey();
+      recv_entries_[parsed_key]->allocate_size_ = req_item.allocate_bytes();
+      recv_entries_[parsed_key]->meta_changed_ = req_item.meta_changed();
+      recv_entries_[parsed_key]->parsed_key_ = parsed_key;
+      // update recv entries
+      recv_entries_[parsed_key]->dri_status_ = RPC_0;
+      // response status
+      resp_item->set_status(DriverMessageItem::RPC_0);
+      RDMA_LOG(1) << "RpcUpdateRemoteDriverEntry parsed_key:"
+                  << parsed_key
+                  << " recv dri_status: RPC_0,"
+                  << " updated dri_status: "
+                  << recv_entries_[parsed_key]->dri_status_;
+    } else if (it->second->dri_status_ == RPC_0 &&
+               status == DriverMessageItem::RPC_1) {
+      // response status
+      resp_item->set_status(DriverMessageItem::RPC_1);
+      // update recv entries
+      recv_entries_[parsed_key]->dri_status_ = DATA_NOT_READY;
+      recv_driver_mgr_entry_ok_nums += 1;
+      RDMA_LOG(1) << "RpcUpdateRemoteDriverEntry parsed_key:"
+                  << parsed_key
+                  << " recv dri_status: RPC_1,"
+                  << " updated dri_status: DATA_NOT_READY ("
+                  << recv_entries_[parsed_key]->dri_status_ << ")";
+    } else {
+      LOG(ERROR) << "UpdateRemoteDriverEntry:"
+                 << " local_name:"
+                 << channel_->local_name_
+                 << " recv from remote:"
+                 << request->host_name()
+                 << " parsed_key:"
+                 << parsed_key
+                 << " recv_entries dri_status is not `RPC_1`,"
+                 << " status is:"
+                 << status
+                 << " dri_status is "
+                 << recv_entries_[parsed_key]->dri_status_;
+    }
+  }
+  RDMA_LOG(1) << "RdmaSendDriverMgr::RpcUpdateRemoteDriverEntry end...."
+              << " localname:" << channel_->local_name_
+              << " remotename:" << channel_->remote_name_
+              << " recv_entries_ size:" << recv_entries_.size();
+  // driver_mgr_ptr is ok and can send tensors to the other client.
+  if (recv_driver_mgr_entry_ok_nums == recv_entries_.size()) {
+    // allocate string RDMA
+    // NOTE(wuyongyu02)
+    // Allocate StringMessage changed to FindOrCreateMemoryRegion
+    // AllocateRecvEntriesStringMemoryAndRegion();
+    LOG(INFO) << "[Succeed] "
+              << request->host_name()
+              << " driver_mgr_ptr RecvEntries is ok!";
+  }
+}
+
+void RdmaSendDriverMgr::AllocateRecvEntriesStringMemoryAndRegion() {
+  for (auto& k : recv_entries_) {
+    void* addr;
+    ibv_mr* mr;
+    // allocate memory and region
+    int allocate_bytes = 0;
+    channel_->channel_record_->AllocateSendStringMemoryAndRegion(
+        channel_->adapter_->pd_, &addr, &mr, &allocate_bytes);
+    k.second->send_mem_mr_ = std::make_shared(
+        addr, mr, allocate_bytes);
+    RDMA_LOG(1) << "AllocateRecvEntriesStringMemoryAndRegion:"
+                << k.first
+                << " allocate_bytes:"
+                << allocate_bytes;
+  }
+}
+
+std::shared_ptr<DriverEntry> RdmaSendDriverMgr::GetRecvEntry(
+    const std::string& parsed_key, bool* has_data) {
+  const auto& it = recv_entries_.find(parsed_key);
+  if (it == recv_entries_.end()) {
+    for (auto& find : recv_entries_) {
+      if (absl::StrContains(parsed_key, find.first)) {
+        return find.second;
+      }
+    }
+    // LOG(FATAL) << parsed_key << " is not found in recv_entries_.";
+    return nullptr;
+  }
+  *has_data = recv_entries_[parsed_key]->dri_status_ == DATA_READY;
+  // LOG(INFO) << "parsed_key:" << parsed_key
+  //           << " status:" << recv_entries_[parsed_key]->dri_status_
+  //           << " has_data:" << *has_data;
+  return recv_entries_[parsed_key];
+}
+
+std::shared_ptr<DriverEntry> RdmaSendDriverMgr::GetDriverEntry(
+    const std::string& parsed_key, bool* has_data) {
+  const auto& it = driver_entries_.find(parsed_key);
+  if (it == driver_entries_.end()) {
+    for (auto& find : driver_entries_) {
+      if (absl::StrContains(parsed_key, find.first)) {
+        return find.second;
+      }
+    }
+    LOG(FATAL) << parsed_key << " is not found in driver_entries_.";
+  }
+  *has_data = driver_entries_[parsed_key]->dri_status_ == DATA_READY;
+  return driver_entries_[parsed_key];
+}
+
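// The two RPC rounds above implement a per-tensor handshake: both peers walk
// RPC_0 -> RPC_1 -> DATA_NOT_READY for every entry before any send-driven
// RDMA write happens. A minimal sketch of the transition rule, assuming the
// DriverStatus values used in this file (the Advance() helper itself is
// illustrative and not part of the patch):
//
//   DriverStatus Advance(DriverStatus local, DriverStatus peer_msg) {
//     if (peer_msg == RPC_0 && local == RPC_0) return RPC_1;
//     if (peer_msg == RPC_1 && local == RPC_1) return DATA_NOT_READY;
//     return local;  // unexpected ordering: keep state and log, as above
//   }
//
// Only once every entry reaches DATA_NOT_READY does driver_mgr_is_ok_ flip
// to true, marking the channel ready for send-driven transfers.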
+DriverEntry::DriverEntry() {
+  dri_status_.store(DRIVER_INIT);
+}
+
+DriverEntry::DriverEntry(const uint32& uid,
+                         const std::string& parsedkey,
+                         void* addr,
+                         ibv_mr* mr,
+                         int allocate_size) {
+  addr_ = (uint64_t) addr;
+  mem_mr_ = std::make_shared(addr, mr, allocate_size);
+  lkey_ = mr->lkey;
+  uinque_id_ = uid;
+  parsed_key_ = parsedkey;
+  dri_status_.store(DRIVER_INIT);
+  allocate_size_ = allocate_size;
+}
+
+string ChannelRecordTensorMetaData::RegexEdgeName(const string& str) {
+  std::string regex_str(".*edge_\\d*(_.*)(_\\d*)?;0:0");
+  std::regex pattern(regex_str, std::regex::icase);
+  std::smatch result;
+  if (std::regex_match(str, result, pattern)) {
+    return std::string(result[1]);
+  } else {
+    LOG(ERROR) << "RegexEdgeName key:" << str << " is not matched. pattern:"
+               << regex_str;
+  }
+  return str;
+}
+
+void ChannelRecordTensorMetaData::InitMetaDataFromEnv() {
+  // Init Channel
+  mutex_lock l(channel_tensor_meta_data_mu_);
+  const string& name = channel_->local_name_;
+  if (absl::StrContains(name, "worker") ||
+      absl::StrContains(name, "localhost")) {
+    const string meta_str = GetWorkerMetas();
+    StringPiece s(meta_str);
+    while (!s.empty()) {
+      StringPiece result = ConsumeNextPart(&s, '|');
+      if (!result.empty()) {
+        StringPiece meta_name_view = ConsumeNextPart(&result, '#');
+        if (!meta_name_view.empty()) {
+          auto meta_name = string(meta_name_view);
+          std::stringstream ss(string(result));
+          int meta_size = 0;
+          ss >> meta_size;
+          CHECK(meta_size > 0)
+              << " meta_name:" << meta_name << " size:" << meta_size;
+          auto find = channel_tensors_meta_data_.find(meta_name);
+          if (find == channel_tensors_meta_data_.end()) {
+            auto it = channel_tensors_meta_data_.emplace(meta_name,
+                                                         TensorMetaData());
+            channel_tensors_uid_parsed_key_.emplace(uid_, meta_name);
+            auto& meta = channel_tensors_meta_data_[meta_name];
+            meta.uid_ = uid_;
+            if (it.second) {
+              uid_++;
+            }
+            meta.data_type_ = DT_INT64;
+            meta.tensor_shape_ = {};
+            meta.proto_size_ = 0;
+            meta.is_dead_ = false;
+          }
+        }
+      }
+    }
+  }
+}
+
+ChannelRecordTensorMetaData::ChannelRecordTensorMetaData(
+    RdmaChannel* channel) {
+  channel_ = channel;
+  InitMetaDataFromEnv();
+}
+
+uint32 ChannelRecordTensorMetaData::GetEnumSize(const DataType& data_type) {
+  switch (data_type) {
+    case DT_FLOAT:
+      return 4;
+    case DT_DOUBLE:
+      return 8;
+    case DT_INT32:
+      return 4;
+    case DT_UINT32:
+      return 4;
+    case DT_UINT16:
+      return 2;
+    case DT_INT8:
+      return 1;
+    case DT_UINT8:
+      return 1;
+    case DT_INT16:
+      return 2;
+    case DT_INT64:
+      return 8;
+    case DT_UINT64:
+      return 8;
+    case DT_BOOL:
+      return 1;
+    default:
+      return 4;
+  }
+}
+
+void ChannelRecordTensorMetaData::AllocateSendStringMemoryAndRegion(
+    ibv_pd* pd,
+    void** addr,
+    ibv_mr** mr,
+    int* addr_size,
+    Allocator* alloc_attr) {
+  // allocate prefix DriverPrefixMessage
+  auto total_bytes = DriverPrefixMessage::kPrefixMessageTotalBytes;
+  RDMA_LOG(1) << "AllocateSendStringMemoryAndRegion total bytes:"
+              << total_bytes;
+  *addr = malloc(total_bytes);
+  CHECK(*addr != nullptr)
+      << "AllocateSendStringMemoryAndRegion addr malloc failed!";
+  *mr = ibv_reg_mr(pd, *addr, total_bytes,
+                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  *addr_size = total_bytes;
+}
+
+int ChannelRecordTensorMetaData::GetTensorBytes(const TensorMetaData& m) {
+  int total_bytes = 0;
+  if (DataTypeCanUseMemcpy(m.data_type_)) {
+    int m1 = m.tensor_shape_.num_elements();
+    total_bytes = m1 * GetEnumSize(m.data_type_);
+  } else {
+    total_bytes = m.proto_size_;
+  }
+  return total_bytes;
+}
+
+void ChannelRecordTensorMetaData::AllocateMemoryAndRegion(
+    const string& key,
+    const TensorMetaData& m,
+    ibv_pd* pd,
+    void** addr,
+    ibv_mr** mr,
+    int* addr_size,
+    Allocator* alloc_attr) const {
+  int total_bytes = GetTensorBytes(m);
+  total_bytes =
+      VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * total_bytes;
+  // allocate prefix DriverPrefixMessage
+  total_bytes += DriverPrefixMessage::kPrefixMessageTotalBytes;
+  RDMA_LOG(1) << "AllocateMemoryAndRegion key:"
+              << key
+              << " total bytes:" << total_bytes;
+  *addr = malloc(total_bytes);
+  CHECK(*addr != nullptr) << "AllocateMemoryAndRegion addr malloc failed!";
+  *mr = ibv_reg_mr(pd, *addr, total_bytes,
+                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  *addr_size = total_bytes;
+}
+
+void ChannelRecordTensorMetaData::Record(const std::string& tensor_name,
+                                         const TensorMetaData& m) {
+  // once send-driving starts, stop recording
+  if (channel_->could_send_driver_) {
+    return;
+  }
+  // LOG(INFO) << "ChannelRecordTensorMetaData::Record " << is_stable_;
+  mutex_lock l(channel_tensor_meta_data_mu_);
+  auto find = channel_tensors_meta_data_.find(tensor_name);
+  if (find == channel_tensors_meta_data_.end()) {
+    // LOG(INFO) << "Channel Record Tensorname:" << tensor_name;
+    auto it =
+        channel_tensors_meta_data_.emplace(tensor_name, TensorMetaData());
+    channel_tensors_uid_parsed_key_.emplace(uid_, tensor_name);
+    auto& meta = channel_tensors_meta_data_[tensor_name];
+    meta.uid_ = uid_;
+    if (it.second) {
+      uid_++;
+    }
+    meta.data_type_ = m.data_type_;
+    meta.tensor_shape_ = m.tensor_shape_;
+    meta.proto_size_ = m.proto_size_;
+    meta.is_dead_ = m.is_dead_;
+  } else {
+    auto& meta = find->second;
+    bool can_memcpy = DataTypeCanUseMemcpy(m.data_type_);
+    if (can_memcpy) {
+      int m1 = 1;
+      int m2 = 1;
+      for (int d = 0; d < m.tensor_shape_.dims(); d++) {
+        m1 *= m.tensor_shape_.dim_size(d);
+        m2 *= meta.tensor_shape_.dim_size(d);
+      }
+      if (m1 > m2) {
+        meta.data_type_ = m.data_type_;
+        meta.tensor_shape_ = m.tensor_shape_;
+        meta.proto_size_ = m.proto_size_;
+        meta.is_dead_ = m.is_dead_;
+      }
+      if (m1 != m2) {
+        // LOG(INFO) << "Tensorname:" << tensor_name << " meta_changed.";
+        meta.meta_changed_ = true;
+      }
+    }
+    if (!can_memcpy && meta.proto_size_ > m.proto_size_) {
+      meta.data_type_ = m.data_type_;
+      meta.tensor_shape_ = m.tensor_shape_;
+      meta.proto_size_ = 10 * m.proto_size_;
+      meta.is_dead_ = m.is_dead_;
+    }
+    if (!can_memcpy && meta.proto_size_ != m.proto_size_) {
+      // LOG(INFO) << "Tensorname:" << tensor_name << " meta_changed.";
+      meta.meta_changed_ = true;
+    }
+  }
+}
+
+StringPiece ChannelRecordTensorMetaData::ConsumeNextPart(StringPiece* s,
+                                                         char delim) {
+  for (size_t offset = 0; offset < s->size(); offset++) {
+    if ((*s)[offset] == delim) {
+      StringPiece result(s->data(), offset);
+      s->remove_prefix(offset + 1);  // +1: remove the delimiter as well
+      return result;
+    }
+  }
+  // No delimiter found: return the rest of the string
+  StringPiece result(s->data(), s->size());
+  s->remove_prefix(s->size());
+  return result;
+}
+
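// InitMetaDataFromEnv() above expects GetWorkerMetas() to return a
// '|'-separated list of "tensor_name#byte_size" pairs. A hedged usage sketch
// of ConsumeNextPart() on that format (the sample string is illustrative):
//
//   StringPiece s("edge_17_loss;0:0#128|edge_18_grad;0:0#4096");
//   while (!s.empty()) {
//     StringPiece pair =
//         ChannelRecordTensorMetaData::ConsumeNextPart(&s, '|');
//     StringPiece name =
//         ChannelRecordTensorMetaData::ConsumeNextPart(&pair, '#');
//     // 'name' is the tensor key; the remainder of 'pair' is its byte size.
//   }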
+
+string RecordTensorMetaData::DebugString() const {
+  std::vector<string> lc;
+  for (auto& it : global_tensors_meta_data_) {
+    std::vector<string> ds;
+    ds.emplace_back(string(it.first));
+    // dtype
+    ds.emplace_back(std::to_string(it.second.data_type_));
+    // num elements
+    auto num_elements = it.second.tensor_shape_.num_elements();
+    ds.emplace_back(std::to_string(num_elements));
+    auto total_bytes = num_elements * GetEnumSize(it.second.data_type_);
+    ds.emplace_back(std::to_string(total_bytes));
+    lc.push_back(absl::StrJoin(ds, ","));
+  }
+  return absl::StrJoin(lc, "\n");
+}
+
+void RecordTensorMetaData::WriteOutput(const std::string& content) const {
+  Env* env = Env::Default();
+  std::string path_dir = GetMetaOutput();
+  if (!env->FileExists(path_dir).ok()) {
+    LOG(INFO) << "File " << path_dir << " does not exist!";
+    env->CreateDir(path_dir);
+    LOG(INFO) << "CreateFileDir " << path_dir << " success!";
+  }
+
+  std::string cfn = path_dir + "/" + local_worker_name_;
+  // Write the content to the output file.
+  std::unique_ptr<WritableFile> file_to_write;
+  TF_CHECK_OK(env->NewWritableFile(cfn, &file_to_write));
+  TF_CHECK_OK(file_to_write->Append(content));
+  TF_CHECK_OK(file_to_write->Close());
+  TF_CHECK_OK(env->FileExists(cfn));
+}
+
+void RecordTensorMetaData::ReadFile(const std::string& filename,
+                                    StringPiece* content) {
+  Env* env = Env::Default();
+  // Read from the file and check content.
+  std::unique_ptr<RandomAccessFile> file_to_read;
+  TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read));
+  char scratch[1024];
+  CHECK_EQ(error::OUT_OF_RANGE,
+           file_to_read->Read(0 /* offset */, 1024 /* n */, content, scratch)
+               .code());
+}
+
+uint32 RecordTensorMetaData::GetEnumSize(const DataType& data_type) {
+  switch (data_type) {
+    case DT_FLOAT:
+      return 4;
+    case DT_DOUBLE:
+      return 8;
+    case DT_INT32:
+      return 4;
+    case DT_UINT32:
+      return 4;
+    case DT_UINT16:
+      return 2;
+    case DT_INT8:
+      return 1;
+    case DT_UINT8:
+      return 1;
+    case DT_INT16:
+      return 2;
+    case DT_INT64:
+      return 8;
+    case DT_UINT64:
+      return 8;
+    case DT_BOOL:
+      return 1;
+    default:
+      return 4;
+  }
+}
+
+void RecordTensorMetaData::GlobalRecord(const std::string& origin_tensor_name,
+                                        const TensorMetaData& m,
+                                        bool stop_record) {
+  // once send-driving starts, stop recording
+  if (stop_record) {
+    return;
+  }
+  mutex_lock l(global_tensor_meta_data_mu_);
+  auto tensor_name = ChannelRecordTensorMetaData::RegexEdgeName(
+      origin_tensor_name);
+  auto find = global_tensors_meta_data_.find(tensor_name);
+  // LOG(INFO) << "Record Tensorname:" << tensor_name;
+  if (find == global_tensors_meta_data_.end()) {
+    auto it =
+        global_tensors_meta_data_.emplace(tensor_name, TensorMetaData());
+    global_tensors_uid_parsed_key_.emplace(uid_, tensor_name);
+    auto& meta = global_tensors_meta_data_[tensor_name];
+    meta.uid_ = uid_;
+    if (it.second) {
+      uid_++;
+    }
+    meta.data_type_ = m.data_type_;
+    meta.tensor_shape_ = m.tensor_shape_;
+    meta.proto_size_ = m.proto_size_;
+    meta.is_dead_ = m.is_dead_;
+  } else {
+    auto& meta = find->second;
+    bool can_memcpy = DataTypeCanUseMemcpy(m.data_type_);
+    if (can_memcpy) {
+      int m1 = 1;
+      int m2 = 1;
+      for (int d = 0; d < m.tensor_shape_.dims(); d++) {
+        m1 *= m.tensor_shape_.dim_size(d);
+        m2 *= meta.tensor_shape_.dim_size(d);
+      }
+      if (m1 > m2) {
+        meta.data_type_ = m.data_type_;
+        meta.tensor_shape_ = m.tensor_shape_;
+        meta.proto_size_ = m.proto_size_;
+        meta.is_dead_ = m.is_dead_;
+      }
+    }
+    if (!can_memcpy && meta.proto_size_ > m.proto_size_) {
+      meta.data_type_ = m.data_type_;
+      meta.tensor_shape_ = m.tensor_shape_;
+      meta.proto_size_ = 10 * m.proto_size_;
+      meta.is_dead_ = m.is_dead_;
+    }
+  }
+  int tmp_sizes = 0;
+  for (const auto& k : global_tensors_meta_data_) {
+    tmp_sizes += ChannelRecordTensorMetaData::GetTensorBytes(k.second);
+  }
+  if (tmp_sizes > total_bytes_) {
+    total_bytes_ = tmp_sizes;
+    LOG(INFO) << "GlobalRecord bytes:" << total_bytes_;
+  }
+}
+
 RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
     : channel_(channel), name_(name) {}
@@ -769,6 +1784,26 @@ void RdmaMessageBuffer::FreeBuffer() {
   }
 }
 
+void RdmaMessageBuffer::ChunkCreateCPUBuffer(size_t size, void* buffer,
+                                             ibv_mr* mr, bool lock) {
+  CHECK(size > 0);
+  if (lock) {
+    mu_.lock();
+  }
+  if (local_status_ != none) {
+    // delete existing buffer
+  }
+  size_ = size;
+  buffer_ = buffer;
+  self_ = mr;
+  CHECK(self_) << "Failed to register memory region";
+  buffer_on_host_ = true;
+  local_status_ = idle;
+  if (lock) {
+    mu_.unlock();
+  }
+}
+
 // Allocate CPU memory for the Rdma buffer
 // Args:
 //   size: to-be-allocated memory size
@@ -827,6 +1862,44 @@ void RdmaMessageBuffer::Write(uint32_t imm_data, size_t buffer_size) {
         remote_.remote_addr, remote_.rkey, RDMA_WRITE_ID_MESSAGE, this);
 }
 
+// Generalized Write method with a prefix scatter/gather element
+void RdmaMessageBuffer::WriteWithPrefix(const RdmaChannel* channel,
+                                        uint32_t imm_data,
+                                        size_t buffer_size,
+                                        uint64_t src_addr,
+                                        uint32_t lkey,
+                                        uint64_t remote_addr,
+                                        uint32_t rkey,
+                                        RdmaWriteIDType write_type,
+                                        void* write_context,
+                                        uint64_t prefix_addr,
+                                        uint32_t prefix_lkey,
+                                        size_t prefix_size) {
+  // Two-element gather list: the prefix message followed by the payload.
+  // ibv_post_send() consumes the sge array before returning, so a stack
+  // array suffices and avoids leaking a heap allocation.
+  struct ibv_sge list[2];
+  list[0].addr = prefix_addr;
+  list[0].length = prefix_size;
+  list[0].lkey = prefix_lkey;
+
+  list[1].addr = src_addr;
+  list[1].length = buffer_size;
+  list[1].lkey = lkey;
+
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+
+  wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
+  wr.sg_list = list;
+  wr.num_sge = 2;
+  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  wr.imm_data = imm_data;
+  wr.wr.rdma.remote_addr = remote_addr;
+  wr.wr.rdma.rkey = rkey;
+
+  struct ibv_send_wr* bad_wr;
+  CHECK(!ibv_post_send(channel->qp_, &wr, &bad_wr)) << "Failed to post send";
+}
+
 // Generalized Write method
 void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
                               size_t buffer_size, uint64_t src_addr,
@@ -840,6 +1913,7 @@ void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
+
   wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
   wr.sg_list = &list;
   wr.num_sge = 1;
@@ -854,17 +1928,21 @@ void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
 }
 
 // Send the next ack from the buffer's job queue.
-void RdmaMessageBuffer::SendAck(const RdmaChannel* channel) {
-  Write(channel, RDMA_IMM_DATA_ACK, 0, 0, 0, 0, 0, RDMA_WRITE_ID_ACK, nullptr);
+void RdmaMessageBuffer::SendAck(const RdmaChannel* channel, int pair_index) {
+  Write(channel, RDMA_IMM_MAX_REQUEST_ID + pair_index, 0, 0, 0, 0, 0,
+        RDMA_WRITE_ID_ACK, nullptr);
 }
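// WriteWithPrefix() above posts a single RDMA_WRITE_WITH_IMM whose gather
// list lands as one contiguous [DriverPrefixMessage | payload] buffer on the
// receiver. A sketch of the receive-side layout under that assumption (the
// DriverPrefixMessage parser itself is not shown in this patch):
//
//   const char* base = static_cast<const char*>(recv_region_addr);
//   // first kPrefixMessageTotalBytes: shape / tensor_bytes / is_dead / ...
//   const char* payload =
//       base + DriverPrefixMessage::kPrefixMessageTotalBytes;
//   // memcpy-able dtypes are then read in place from 'payload'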
 
 // Send the next message from the buffer's job queue.
 void RdmaMessageBuffer::SendNextItem() {
-  uint32_t imm_data = RDMA_IMM_DATA_MESSAGE;
+  uint32_t imm_data = RDMA_IMM_DATA_ACK + pair_index_;
   mu_.lock();
   if (!queue_.empty() && (local_status_ == idle) && (remote_status_ == idle)) {
     local_status_ = busy;
     remote_status_ = busy;
+    time_guard_ = 0;
+    rm_ack_micros_ = 0;
+    // LOG(ERROR) << "SendNextItem queue size:" << queue_.size();
     string message = queue_.front();
     queue_.pop();
     // local/remote_status_ won't be set back to idle
@@ -969,47 +2047,63 @@ static void StreamGPUOp(Device* gpu_device,
                         const DeviceContext* device_context,
 
 RdmaTensorResponse* RdmaChannel::AddTensorResponse(const RdmaMessage& rm) {
   mutex_lock lock{mu_};
-  auto it =
-      responses_table_.emplace(rm.request_index_, RdmaTensorResponse(this, rm));
+  auto it = responses_table_.emplace(
+      rm.request_index_, std::make_shared<RdmaTensorResponse>(this, rm));
   CHECK(it.second) << "Response with the ID " << rm.request_index_
                    << " already exists.";
-  return &it.first->second;
+  // replicate the request index on the response
+  it.first->second->request_index_ = rm.request_index_;
+  return it.first->second.get();
 }
 
 RdmaTensorResponse* RdmaChannel::UpdateTensorResponse(const RdmaMessage& rm) {
   mutex_lock lock{mu_};
   auto it = responses_table_.find(rm.request_index_);
   CHECK(it != responses_table_.end()) << "No response found.";
-  RdmaTensorResponse* response = &it->second;
+  RdmaTensorResponse* response = it->second.get();
   response->Update(rm);
   return response;
 }
 
 void RdmaChannel::RemoveTensorResponse(uint32_t request_index) {
   mutex_lock lock{mu_};
-  responses_table_.erase(request_index);
+  if (responses_table_.find(request_index) != responses_table_.end())
+    responses_table_.erase(request_index);
 }
 
 void RdmaTensorResponse::Start() {
+  // LOG(INFO) << "RdmaTensorResponse Start...";
   Rendezvous::ParsedKey parsed;
   Status s = Rendezvous::ParseKey(rm_.name_, &parsed);
+  if (s.ok()) {
+    s = PrepareRecvTensor(parsed, &src_dev_);
+  }
   if (!s.ok()) {
-    SendErrorStatus(s);
+    SendErrorStatus(s, "RdmaTensorResponse::Start::PrepareRecvTensor");
     return;
   }
-
+  recv_local_send_rdma_ = 0;
   channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(
       rm_.step_id_, parsed,
-      [this, parsed](const Status& status, const Rendezvous::Args& send_args,
+      [this](const Status& status, const Rendezvous::Args& send_args,
              const Rendezvous::Args& recv_args, const Tensor& in,
-             bool is_dead) {
-        CHECK(status.ok()) << "RecvLocalAsync was not ok."
-                           << " error message: " << status.error_message();
-        RecvHandler(parsed, send_args, recv_args, in, is_dead);
+             bool is_dead) mutable {
+        // NOTE(wuyongyu02): if the sender cannot obtain the tensor from the
+        // local rendezvous it shouldn't CHECK-fail; it can report the error
+        // the same way RdmaTensorResponse::RecvHandler() does, via
+        // SendErrorStatus(status).
+        if (!status.ok()) {
+          // SendErrorStatus(status, "rendezvous_mgr->RecvLocalAsync::");
+          return;
+        }
+        RecvHandler(send_args, recv_args, in, is_dead);
       });
 }
 
-void RdmaTensorResponse::Resume() { SendContent(*tensor_, *proto_, is_dead_); }
+void RdmaTensorResponse::Resume() {
+  SendContent(*tensor_, *proto_, is_dead_, true);
+}
 
 // Helper for RecvTensor. Validates "key" and returns the source
 // device in "*src_dev".
@@ -1035,16 +2129,9 @@ Status RdmaTensorResponse::PrepareRecvTensor(
   return Status::OK();
 }
 
-void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
-                                     const Rendezvous::Args& send_args,
+void RdmaTensorResponse::RecvHandler(const Rendezvous::Args& send_args,
                                      const Rendezvous::Args& recv_args,
                                      const Tensor& in, bool is_dead) {
-  Status s = PrepareRecvTensor(parsed, &src_dev_);
-  if (!s.ok()) {
-    SendErrorStatus(s);
-    return;
-  }
-
   meta_data_changed_ = TensorMetaDataChanged(in, is_dead);
 #ifdef RDMA_DATA_VALIDATION
   // Always send a meta data message with the source checksum
@@ -1071,9 +2158,7 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
     // so anyway we'll have to copy it from GPU to CPU first. If at some
    // point in time Clone() is changed to only save a shallow copy, we can
     // skip the copy here as well.
-    if ((in.TotalBytes() > 0) && !meta_data_changed_ &&
-        (RdmaMemoryMgr::Singleton().FindMemoryRegion(
-             (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
+    if ((in.TotalBytes() > 0) && !meta_data_changed_) {
       StreamGPUOp(src_dev_, send_dev_context,
                   [this, in, proto, is_dead](const Status& s) {
                     Send(in, proto, is_dead, s);
@@ -1101,7 +2186,8 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
                         });
     }
 #else
-    SendErrorStatus(errors::Internal("No GPU device in process"));
+    SendErrorStatus(errors::Internal("No GPU device in process"),
+                    "No GPU device in process");
 #endif  // GOOGLE_CUDA
   } else {
     // tensor is in CPU memory.
@@ -1115,17 +2201,180 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
 void RdmaTensorResponse::Send(const Tensor& in, const TensorProto& proto,
                               bool is_dead, const Status& status) {
   if (!status.ok()) {
-    SendErrorStatus(status);
+    SendErrorStatus(status, "RdmaTensorResponse::Send::!status.ok");
+    return;
+  }
+  SendBck(in, proto, is_dead, status);
+}
+
+void RdmaChannel::SendDriverData(const Tensor& in,
+                                 bool is_dead,
+                                 const std::string& name) {
+  bool has_data = false;
+  std::shared_ptr<DriverEntry> entry =
+      rdma_send_driver_mgr_->GetRecvEntry(name, &has_data);
+
+  CHECK(entry.get() != nullptr) << "Channel SendDriverData to "
+                                << name
+                                << " is_dead:"
+                                << is_dead
+                                << " dtype:"
+                                << DataTypeString(in.dtype())
+                                << " shape:"
+                                << in.shape();
+
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  TensorProto proto;
+  if (!can_memcpy) {
+    in.AsProtoTensorContent(&proto);
+  }
+  size_t tensor_bytes = can_memcpy ? in.TotalBytes() : proto.ByteSize();
+  if (is_dead) {
+    tensor_bytes = 0;
+  }
+  entry->send_micros_ = 0;
+  // prefix
+  string prefix = DriverPrefixMessage::CreateDriverPrefixMessage(in.shape(),
+      tensor_bytes, is_dead, entry->send_micros_, entry->meta_changed_);
+  uint32_t imm_data = entry->uinque_id_;
+
+  // tensor
+  uint32_t send_tensor_lkey = 0;
+  size_t prefix_s = prefix.size();
+  int need_length = prefix_s + tensor_bytes;
+  if (entry->tensor_addr_ == nullptr) {
+    if (!FindLocalMr(name, &entry->tensor_addr_,
+                     &entry->smr_, &entry->local_allocate_size_)) {
+      entry->local_allocate_size_ = 0;
+    }
+  }
+  if (need_length > entry->local_allocate_size_) {
+    LOG(INFO) << "key:" << name << " realloc need_length:"
+              << need_length
+              << " already size:"
+              << entry->local_allocate_size_;
+    entry->local_allocate_size_ = Alloc(prefix_s +
+        VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * tensor_bytes,
+        &entry->tensor_addr_, &entry->smr_, false);
+  }
+
+  if (!is_dead) {
+    if (can_memcpy) {
+      // allocate region and copy data
+      entry->src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
+      if (entry->src_buffer_ != nullptr) {
+        if (tensor_bytes > 0) {
+          void* addr_offset =
+              (void*)((uint64_t)entry->tensor_addr_ + prefix_s);
+          memcpy(addr_offset, DMAHelper::base(&in), tensor_bytes);
+        }
+      }
+    } else {
+      // for send-driven transfers
+      void* addr_offset = (void*)((uint64_t)entry->tensor_addr_ + prefix_s);
+      proto.SerializeToArray(addr_offset, tensor_bytes);
+    }
+  } else {
+    tensor_bytes = 0;
+  }
+  memcpy(entry->tensor_addr_, prefix.data(), prefix_s);
+  send_tensor_lkey = (entry->smr_ == nullptr) ? 0 : entry->smr_->lkey;
+  // remote mr addr
+  uint64_t remote_addr = entry->addr_;
+  uint32_t rkey = entry->lkey_;
+  CHECK(tensor_bytes + prefix_s <= entry->allocate_size_)
+      << " name:" << name
+      << " may need a larger static memory allocation ratio;"
+      << " tensor_bytes:" << tensor_bytes
+      << " prefix_s:" << prefix_s
+      << " entry->allocate_size_:" << entry->allocate_size_;
+  auto tensor_addr = (uint64_t)entry->tensor_addr_;
+  RdmaMessageBuffer::Write(this, imm_data, tensor_bytes + prefix_s,
+                           tensor_addr, send_tensor_lkey, remote_addr, rkey,
+                           RDMA_WRITE_ID_SEND_DEIVER_WRITE, entry.get());
+}
+
+void RdmaChannel::InitAndSetDriverStatus() {
+  size_t entries_size = rdma_send_driver_mgr_->InitLocalDriverEntry();
+  // init LocalDriverBufferMgr
+  size_t ready_size = local_driver_buffer_mgr_->InitLocalDriverBufferMgr();
+  CHECK_EQ(entries_size, ready_size)
+      << "NotifyAsyncAllocator entries_size:"
+      << entries_size
+      << " ready_size:"
+      << ready_size;
+  // TODO(wuyongyu): could_send_driver_ must be set before the async
+  // InitLocalDriverEntry.
+  could_send_driver_ = true;
+}
+
+void RdmaChannel::PleSendOrCheck() {
+  const auto& remote_name = remote_name_;
+  const auto& local_name = local_name_;
+  RDMA_LOG(1) << "PleSendOrCheck local_worker_name:" << local_name
+              << " remote_name:" << remote_name;
+
+  // get the channel cache
+  SharedGrpcChannelPtr client_channel =
+      channel_cache_->FindWorkerChannel(remote_name);
+  CHECK(client_channel != nullptr) << "PleSendOrCheck target:"
+                                   << remote_name
+                                   << " client_channel is null!";
+  GrpcVerbsClient* client = new GrpcVerbsClient(client_channel);
+  CHECK(client != nullptr) << "PleSendOrCheck No worker known as "
+                           << remote_name;
+
+  PleSendOrCheckReq req;
+  req.set_host_name(local_name);
+  // synchronous call
+  PleSendOrCheckResp resp;
+  Status s;
+  int attempts = 0;
+  static const int max_num_attempts = 5;
+  do {
+    s = client->ReqPleSendOrCheck(&req, &resp);
+    // save obtained remote addresses
+    // connect to the remote channel
+    if (s.ok() && resp.is_ok()) {
+      LOG(INFO) << "verbs to " << remote_name
+                << " ReqPleSendOrCheck succeed!";
+    } else {
+      LOG(ERROR) << "ReqPleSendOrCheck Connecting to "
+                 << remote_name << ": Got "
+                 << s.error_message() << ". Retrying (" << (attempts + 1)
+                 << "/" << max_num_attempts << ")..."
+                 << " remote worker async status:" << resp.is_ok();
+      if (++attempts == max_num_attempts) {
+        LOG(FATAL) << "RdmaChannel::PleSendOrCheck failed";
+      }
+      adapter_->worker_env_->env->SleepForMicroseconds(2000000);
+    }
+  } while (!s.ok() || !resp.is_ok());
+  delete client;
+}
+
+void RdmaTensorResponse::SendBck(const Tensor& in, const TensorProto& proto,
+                                 bool is_dead, const Status& status) {
+  if (!status.ok()) {
+    SendErrorStatus(status, "RdmaTensorResponse::SendBck::!status.ok");
     return;
   }
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   bool proto_size_changed =
       (!can_memcpy) && (proto.ByteSize() != rm_.tensor_bytes_);
+
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_buffer = channel_->message_buffers()[buffer_index];
+  // move cpu allocator tensor to RdmaMR tensor
+  // RdmaClone(in, proto, is_dead);
   if (meta_data_changed_ || proto_size_changed) {
     Clone(in, proto, is_dead);
+    // NOTE: SendMetaData() only enqueues the meta-data update; the paired
+    // message buffer must be kicked explicitly to transmit it.
     SendMetaData(in, proto, is_dead);
+    tx_buffer->SendNextItem();
   } else {
-    SendContent(in, proto, is_dead);
+    SendContent(in, proto, is_dead, false);
   }
 }
 
@@ -1134,6 +2383,21 @@ bool RdmaTensorResponse::TensorMetaDataChanged(const Tensor& in, bool is_dead) {
          (rm_.is_dead_ != is_dead);
 }
 
+void RdmaTensorResponse::RdmaClone(const Tensor& in, const TensorProto& proto,
+                                   bool is_dead) {
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  if (can_memcpy && (in.TotalBytes() > 0)) {
+    tensor_ =
+        new Tensor(channel_->rdma_mem_allocator_, in.dtype(), in.shape());
+    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  } else {
+    tensor_ = new Tensor(in.dtype(), in.shape());
+  }
+  if (!can_memcpy) {
+    proto_ = new TensorProto(proto);
+  }
+  is_dead_ = is_dead;
+}
+
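// RdmaClone()/Clone() above copy the tensor into memory that is already
// ibv_reg_mr()'d, so the subsequent RDMA write needs no registration lookup
// or extra staging copy. FakeAllocator (used below) follows the same idea:
// wrap a pre-registered address so a Tensor can alias it. Its definition is
// not part of this patch; a minimal sketch of such a wrapper, offered only
// as an assumption about its shape:
//
//   class PreRegisteredAllocator : public Allocator {
//    public:
//     explicit PreRegisteredAllocator(void* base) : base_(base) {}
//     string Name() override { return "pre_registered"; }
//     void* AllocateRaw(size_t, size_t) override { return base_; }
//     void DeallocateRaw(void*) override {}  // region outlives the tensor
//    private:
//     void* base_;
//   };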
 void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
                                bool is_dead) {
   // Clone the data to be sent later. For simplicity, we clone the tensor's
   // data even if it is already a copy. Performance is less of a concern
   // here; note that some tensors share their buffer between different
   // step-ids, so the tensor content may change before the re-request
   // completes.
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  // if (can_memcpy && (in.TotalBytes() > 0)) {
+  //   AllocatorAttributes host_alloc_attrs;
+  //   host_alloc_attrs.set_nic_compatible(true);
+  //   host_alloc_attrs.set_on_host(true);
+  //   Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
+  //   tensor_ = new Tensor(allocator, in.dtype(), in.shape());
+  //   memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  // } else {
+  //   tensor_ = new Tensor(in.dtype(), in.shape());
+  // }
   if (can_memcpy && (in.TotalBytes() > 0)) {
-    AllocatorAttributes host_alloc_attrs;
-    host_alloc_attrs.set_nic_compatible(true);
-    host_alloc_attrs.set_on_host(true);
-    Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
-    tensor_ = new Tensor(allocator, in.dtype(), in.shape());
-    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+    channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+        &src_addr_, &mr_, &res_region_, in.TotalBytes());
+    memcpy(src_addr_, DMAHelper::base(&in), in.TotalBytes());
+    res_fake_allocator_ = new FakeAllocator(src_addr_);
+    tensor_ = new Tensor(res_fake_allocator_, in.dtype(), in.shape());
   } else {
     tensor_ = new Tensor(in.dtype(), in.shape());
   }
@@ -1160,12 +2434,13 @@ void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
 
 void RdmaTensorResponse::SendMetaData(const Tensor& in,
                                       const TensorProto& proto, bool is_dead) {
+  // LOG(INFO) << "SendMetaData...";
+  send_meta_begin_ = 0;
   RDMA_LOG(2) << "Request #" << rm_.request_index_
               << ": Meta data changed: " << rm_.name_;
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
-
   // Send meta-data update:
   RdmaMessage rm;
   rm.type_ = RDMA_MESSAGE_META_DATA_UPDATE;
   rm.name_size_ = rm_.name_.size();
@@ -1186,31 +2461,46 @@ void RdmaTensorResponse::SendMetaData(const Tensor& in,
               << " data-type = " << DataTypeString(rm.data_type_) << "."
               << " is-dead = " << rm.is_dead_ << ")";
 
+  // rm.create_micros_ = send_meta_begin_;
   string message = RdmaMessage::CreateMessage(rm);
-  channel_->tx_message_buffer_->EnqueueItem(message);
-  channel_->tx_message_buffer_->SendNextItem();
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_message_buffer = channel_->message_buffers()[buffer_index];
+  tx_message_buffer->EnqueueItem(message);
 }
 
 void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
-                                     bool is_dead) {
+                                     bool is_dead, bool is_resume) {
+  // update recv_local_send_rdma_; on resume the tensor was already cloned
+  // into RDMA-registered memory, so the copy below can be skipped.
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
   uint32_t imm_data = rm_.request_index_;
+
+  AllocatorAttributes host_alloc_attrs;
+  host_alloc_attrs.set_nic_compatible(true);
+  host_alloc_attrs.set_on_host(true);
+  Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
   if (!is_dead) {
-    if (can_memcpy) {
+    if ((can_memcpy && !is_resume) || in.TotalBytes() == 0) {
+      // when sending content directly, the data has to be copied
       src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
       if (src_buffer_ != nullptr) {
-        src_buffer_->Ref();  // Keep buffer alive until write is complete
-        src_addr_ = src_buffer_->data();
-        mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(src_addr_,
-                                                          tensor_bytes);
+        // src_buffer_->Ref();  // Keep buffer alive until write is complete
+        // TODO(wuyongyu02): Move to Meta Change
+        channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+            &src_addr_, &mr_, &res_region_, tensor_bytes);
+        if (tensor_bytes > 0) {
+          memcpy(src_addr_, src_buffer_->data(), tensor_bytes);
+        }
       }
-    } else {
+    }
+    if (!can_memcpy) {
       RDMA_LOG(2) << "Encoding proto: " << rm_.name_
                   << " (Size: " << tensor_bytes << ") " << in.DebugString();
-      src_addr_ = malloc(tensor_bytes);
-      mr_ = ibv_reg_mr(channel_->adapter_->pd_, src_addr_, tensor_bytes,
-                       IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+      channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+          &src_addr_, &mr_, &res_region_, tensor_bytes);
       proto.SerializeToArray(src_addr_, tensor_bytes);
     }
   } else {
@@ -1230,7 +2520,8 @@ void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
                 rm_.rkey_, RDMA_WRITE_ID_TENSOR_WRITE, this);
 }
 
-void RdmaTensorResponse::SendErrorStatus(const Status& status) {
+void RdmaTensorResponse::SendErrorStatus(const Status& status,
                                         const std::string& src_func_name) {
   RdmaMessage rm;
   rm.type_ = RDMA_MESSAGE_ERROR_STATUS;
   rm.name_size_ = rm_.name_.size();
@@ -1238,28 +2529,40 @@ void RdmaTensorResponse::SendErrorStatus(const Status& status) {
   rm.step_id_ = rm_.step_id_;
   rm.request_index_ = rm_.request_index_;
   rm.status_ = status;
-  LOG(ERROR) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+
+  LOG(INFO) << "Step 0x" << std::hex << rm.step_id_ << std::dec
             << ": Sending RDMA_MESSAGE_ERROR_STATUS #" << rm.request_index_
-            << ": " << rm.name_ << ". Status: " << status.ToString();
+            << ": " << rm.name_ << ". Status: " << status.ToString()
+            << " src_func_name:" << src_func_name;
   string message = RdmaMessage::CreateMessage(rm);
-  channel_->tx_message_buffer_->EnqueueItem(message);
-  channel_->tx_message_buffer_->SendNextItem();
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_message_buffer = channel_->message_buffers()[buffer_index];
+  tx_message_buffer->EnqueueItem(message);
+  tx_message_buffer->SendNextItem();
   // Destroy the response.
   Destroy();
 }
 
 void RdmaTensorResponse::Destroy() {
+  if (res_region_.get() != nullptr) {
+    // res_region_->Unref();
+  }
+  // response end
   if (src_buffer_ != nullptr) {
-    src_buffer_->Unref();
+    // src_buffer_->Unref();
   }
   if (tensor_ != nullptr) {
     delete tensor_;
   }
   if (proto_ != nullptr) {
-    ibv_dereg_mr(mr_);
-    free(src_addr_);
+    // ibv_dereg_mr(mr_);
+    // free(src_addr_);
     delete proto_;
   }
   // Remove response from the pending list:
@@ -1275,8 +2578,10 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   // Rdma Message format
   // type|name_size|name|step_id|request_index|remote_addr|rkey|is_dead|...
   //   1B|    2B   | 512|  8B   |     8B      |     8B    | 4B |   1B  |...
-  // ...|data_type|tensor_shape|tensor_bytes|error_status          |
-  // ...|   XB    |    XB      |    8B      |size - 4B, proto - XB |
+  // ...|data_type|tensor_shape|tensor_bytes|create_micros|error_status          |
+  // ...|   XB    |    XB      |    8B      |     8B      |size - 4B, proto - XB |
+  // ...|remote_bytes_addr|remote_bytes_value|
+  // ...|       8B        |        4B        |
   //
   // ACK:             Imm-type: ACK
   // TENSOR_REQUEST:  Imm-type: MESSAGE
@@ -1292,12 +2597,13 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   //   Fields: type, request_index, name, step_id, error_status
   // Tensor content:  Imm-type: request_index
   size_t message_size = kMessageTotalBytes;
-  char message[kMessageTotalBytes + kErrorStatusMaxSize];
+  char message[kMessageTotalBytes + kErrorStatusMaxSize + 100];
   // type
   message[kTypeStartIndex] = static_cast<char>(rm.type_) & 0xff;
   // request index
   memcpy(&message[kRequestIndexStartIndex], &rm.request_index_,
          sizeof(rm.request_index_));
+
   // name, step_id, remote_addr, rkey
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
@@ -1308,10 +2614,16 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
            sizeof(rm.remote_addr_));
     memcpy(&message[kRkeyStartIndex], &rm.rkey_, sizeof(rm.rkey_));
     memcpy(&message[kStepIdStartIndex], &rm.step_id_, sizeof(rm.step_id_));
+
+    // memcpy(&message[KRemoteBytesAddrKeyStartIndex],
+    //        &rm.remote_bytes_addr_key_, sizeof(rm.remote_bytes_addr_key_));
+    // memcpy(&message[KRemoteBytesAddrStartIndex],
+    //        &rm.remote_bytes_addr_, sizeof(rm.remote_bytes_addr_));
   }
-  // is_dead, data_type, tensor_shape, tensor_bytes
+  // is_dead, data_type, tensor_shape, tensor_bytes, create_micros
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&message[kIsDeadStartIndex], &rm.is_dead_, sizeof(rm.is_dead_));
@@ -1321,14 +2633,18 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
            sizeof(rm.tensor_shape_));
     memcpy(&message[kTensorBytesStartIndex], &rm.tensor_bytes_,
            sizeof(rm.tensor_bytes_));
+    // memcpy(&message[kCreateMicrosStartIndex], &rm.create_micros_,
+    //        sizeof(rm.create_micros_));
   }
-// checksum
+  // checksum
 #ifdef RDMA_DATA_VALIDATION
   memcpy(&message[kChecksumStartIndex], &rm.checksum_, sizeof(rm.checksum_));
 #endif
   // error status
   if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
     ::grpc::Status gs = ToGrpcStatus(rm.status_);
+    // (wuyongyu) decrease the error message size:
+    // https://km.sankuai.com/page/403000580
+    // ::grpc::Status gs = ::grpc::Status::OK;
     ErrorStatusProto gsProto;
     gsProto.set_error_code(gs.error_code());
     gsProto.set_error_message(gs.error_message());
@@ -1348,12 +2664,47 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   return string(message, message_size);
 }
 
-// Parse a RdmaMessage according to the pre-defined format
-// Args:
-//   rm: the message structure where the parsed message will be saved
-//   buffer: the place where the raw message is stored
-// Returns:
-//   None
+string FussionMessages::CreateFusionMessages(
+    const std::vector<RdmaMessage>& rmv) {
+  CHECK(rmv.size() < kRdmaMaxMessagesNumber)
+      << "FussionMessages CreateFusionMessages must fuse fewer than "
+      << kRdmaMaxMessagesNumber << " messages";
+  size_t message_size = kTotalFussionMessageSize;
+  char message[kTotalFussionMessageSize + RdmaMessage::kRdmaMessageBufferSize];
+  uint32_t* mn = (uint32_t*)&message[kMessageNumbersStartIndex];
+  *mn = rmv.size();
+  for (size_t i = 0; i < rmv.size(); i++) {
+    string m = RdmaMessage::CreateMessage(rmv[i]);
+    uint32_t* ms = (uint32_t*)&message[kMessageSizeStartIndex + i * 4];
+    *ms = m.size();
+    memcpy(&message[KStringMessagesStartIndex +
+                    i * RdmaMessage::kRdmaMessageBufferSize],
+           m.data(), m.size());
+  }
+  return string(message, message_size);
+}
+
+void FussionMessages::ParseFussionMessages(std::vector<RdmaMessage>& rmv,
+                                           void* buffer) {
+  char* message = static_cast<char*>(buffer);
+  uint32_t mn = 0;
+  memcpy(&mn, &message[kMessageNumbersStartIndex], sizeof(mn));
+  if (mn == 0) {
+    return;
+  }
+  // resize (not reserve): the loop below writes into rmv[i]
+  rmv.resize(mn);
+  for (uint32_t i = 0; i < mn; i++) {
+    uint32_t message_size;
+    memcpy(&message_size, &message[kMessageSizeStartIndex + i * 4],
+           sizeof(message_size));
+    char m[RdmaMessage::kMessageTotalBytes +
+           RdmaMessage::kErrorStatusMaxSize + 100];
+    memcpy(m, &message[KStringMessagesStartIndex +
+                       i * RdmaMessage::kRdmaMessageBufferSize], message_size);
+    RdmaMessage::ParseMessage(rmv[i], m);
+  }
+}
+
 void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   char* message = static_cast<char*>(buffer);
   // type
@@ -1361,6 +2712,7 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   // request index
   memcpy(&rm.request_index_, &message[kRequestIndexStartIndex],
          sizeof(rm.request_index_));
+
   // name, step_id, remote_addr, rkey
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
@@ -1371,10 +2723,17 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
            sizeof(rm.remote_addr_));
     memcpy(&rm.rkey_, &message[kRkeyStartIndex], sizeof(rm.rkey_));
     memcpy(&rm.step_id_, &message[kStepIdStartIndex], sizeof(rm.step_id_));
+    // memcpy(&rm.remote_bytes_addr_key_,
+    //        &message[KRemoteBytesAddrKeyStartIndex],
+    //        sizeof(rm.remote_bytes_addr_key_));
+    // memcpy(&rm.remote_bytes_addr_,
+    //        &message[KRemoteBytesAddrStartIndex],
+    //        sizeof(rm.remote_bytes_addr_));
   }
   // data_type, tensor_bytes, tensor_shape, is_dead
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) ||
      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&rm.is_dead_, &message[kIsDeadStartIndex], sizeof(rm.is_dead_));
     memcpy(&rm.data_type_, &message[kDataTypeStartIndex],
@@ -1383,8 +2742,10 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
            sizeof(rm.tensor_shape_));
     memcpy(&rm.tensor_bytes_, &message[kTensorBytesStartIndex],
            sizeof(rm.tensor_bytes_));
+    // memcpy(&rm.create_micros_, &message[kCreateMicrosStartIndex],
+    //        sizeof(rm.create_micros_));
   }
-// checksum
+  // checksum
 #ifdef RDMA_DATA_VALIDATION
   memcpy(&rm.checksum_, &message[kChecksumStartIndex], sizeof(rm.checksum_));
 #endif
@@ -1401,6 +2762,10 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   }
 }
 
+ibv_mr* RdmaChannel::FindMemoryRegion(void* addr, size_t length) {
+  return rdma_memory_mgr_->FindMemoryRegion(addr, length);
+}
+
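// Wire layout produced by CreateFusionMessages() and consumed by
// ParseFussionMessages() above: a count, a size table, then each message at
// a fixed stride, so entry i is addressable without scanning:
//
//   [kMessageNumbersStartIndex]                  uint32 n
//   [kMessageSizeStartIndex + 4*i]               uint32 size[i]
//   [KStringMessagesStartIndex
//       + i * RdmaMessage::kRdmaMessageBufferSize]  message[i] bytes
//
// e.g. message i is recovered with:
//   memcpy(m, &buf[KStringMessagesStartIndex +
//                  i * RdmaMessage::kRdmaMessageBufferSize], size[i]);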
 //*****************************************************************************
 // RdmaMemoryMgr
 //*****************************************************************************
@@ -1423,12 +2788,15 @@ void RdmaMemoryMgr::InsertMemoryRegion(void* addr, size_t length,
     RDMA_LOG(1) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
                 << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
                 << " SIZE: 0x" << length << " (" << allocator_name << ").";
+  // LOG(INFO) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
+  //           << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
+  //           << " SIZE: 0x" << length << " (" << allocator_name << ").";
   if (mr != nullptr) {
     mutex_lock l(mrs_mu_);
     auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
     mrs_.insert(iter, {mr, &MRDeleter});
   } else {
-    LOG(WARNING) << "Cannot register memory region";
+    LOG(FATAL) << "Cannot register memory region";
   }
 }
 
@@ -1445,7 +2813,7 @@ void RdmaMemoryMgr::EvictMemoryRegion(void* addr, size_t length) {
   }
 }
 
-const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
+const TensorMetaData* RdmaChannel::GetTensorMetaData(
     const std::string& tensor_name) {
   mutex_lock l(tensor_meta_data_mu_);
   auto it = tensors_meta_data_.find(tensor_name);
@@ -1455,7 +2823,7 @@ const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
   return &it->second;
 }
 
-const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
+const TensorMetaData* RdmaChannel::SetTensorMetaData(
     const std::string& tensor_name, DataType dtype, const TensorShape& shape,
     bool is_dead, size_t proto_size) {
   mutex_lock l(tensor_meta_data_mu_);
@@ -1471,49 +2839,276 @@ const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
 // RdmaTensorRequest
 //*****************************************************************************
 
+Status LocalDriverBufferMgr::QueueRdmaSave(const string& key,
+    const Args& send_args, Tensor* val, const bool is_dead,
+    const uint64& send_begin_micros) {
+  string key_hash(key);
+  if (!status_.ok()) {
+    Status s = status_;
+    return s;
+  }
+  QueueItems* queue_pair = queue_table_[key_hash];
+  CHECK(queue_pair != nullptr) << "QueueRdmaSave queue_pair is nullptr:"
+                               << key_hash;
+  ItemQueue* queue_item = queue_pair->queue;
+  queue_pair->queue_lock_.lock();
+  if (queue_item->empty() || queue_item->front()->HasValue()) {
+    RDMA_LOG(1) << "QueueRdmaSave Enqueue Send Item (key:" << key << "). ";
+    Item* item = new Item;
+    item->value = val;
+    item->is_dead = is_dead;
+    item->has_value = true;
+    item->send_args = send_args;
+    item->send_start_micros_ = Env::Default()->NowMicros();
+    if (item->send_args.device_context) {
+      item->send_args.device_context->Ref();
+    }
+    queue_item->push_back(item);
+    // LOG(INFO) << "QueueRdmaEnqueueSendWaitRecv_Micros:"
+    //           << item->send_start_micros_ - send_args.rendezvous_micros;
+    queue_pair->queue_lock_.unlock();
+    return Status::OK();
+  }
+  RDMA_LOG(1) << "QueueRdmaSave Consume Recv Item (key:" << key << "). ";
+  Item* item = queue_item->front();
+  if (queue_item->size() == 1) {
+    VLOG(2) << "Clean up Send/Recv queue (key:" << key << "). ";
+    // queue_table_.erase(key_hash);
+  }
+  queue_item->pop_front();
+  queue_pair->queue_lock_.unlock();
+  DCHECK(item->HasCallback());
+  // LOG(INFO) << "QueueRdmaRecvWaitSend_Micros key:" << key << " micros:"
+  //           << Env::Default()->NowMicros() - item->recv_start_micros_;
+  item->waiter(Status::OK(), send_args, item->recv_args, *val, is_dead);
+  delete item;
+  return Status::OK();
+}
+
+Status LocalDriverBufferMgr::RdmaSave(const string& key, const Args& send_args,
+                                      const Tensor& val, const bool is_dead) {
+  LOG(FATAL) << "RdmaSave should not be used; QueueRdmaSave is the"
+             << " supported path.";
+  return Status::OK();
+}
+
+void LocalDriverBufferMgr::QueueLoadAsync(const string& key,
+    const Args& recv_args, DoneCallback done,
+    const uint64& request_start_micros) {
+  string key_hash(key);
+  if (!status_.ok()) {
+    // Rendezvous has been aborted.
+    Status s = status_;
+    done(s, Args(), recv_args, Tensor(), false);
+    return;
+  }
+  const auto& find = queue_table_.find(key_hash);
+  if (find == queue_table_.end()) {
+    for (auto& find : queue_table_) {
+      if (absl::StrContains(key_hash, find.first)) {
+        key_hash = find.first;
+        break;
+      }
+    }
+  }
+  QueueItems* queue_pair = queue_table_[key_hash];
+  CHECK(queue_pair != nullptr)
+      << "QueueLoadAsync queue_pair is null:" << key_hash;
+  ItemQueue* queue_item = queue_pair->queue;
+
+  queue_pair->queue_lock_.lock();
+  if (queue_item->empty() || !queue_item->front()->HasValue()) {
+    CancellationManager* cm = recv_args.cancellation_manager;
+    CancellationToken token = CancellationManager::kInvalidToken;
+    bool already_cancelled = false;
+    if (cm != nullptr) {
+      token = cm->get_cancellation_token();
+      already_cancelled = !cm->RegisterCallback(token, [this, token,
+                                                        key_hash] {
+        Item* item = nullptr;
+        {
+          QueueItems* queue_pair = queue_table_[key_hash];
+          ItemQueue* queue_item = queue_pair->queue;
+          if (queue_item->empty() || !queue_item->front()->HasValue()) {
+            for (auto it = queue_item->begin(); it != queue_item->end();
+                 it++) {
+              if ((*it)->cancellation_token == token) {
+                item = *it;
+                // the key_hash queue can be reused, so only the item is
+                // erased (not the queue_table_ entry); break here, since
+                // erase() invalidates the iterator.
+                queue_item->erase(it);
+                break;
+              }
+            }
+          }
+        }
+        if (item != nullptr) {
+          item->waiter(StatusGroup::MakeDerived(
+                           errors::Cancelled("LoadAsync is cancelled.")),
+                       Args(), item->recv_args, Tensor(), /*is_dead=*/false);
+          delete item;
+        }
+      });
+    }
+    if (already_cancelled) {
+      queue_pair->queue_lock_.unlock();
+      done(StatusGroup::MakeDerived(
+               errors::Cancelled("LoadAsync is cancelled.")),
+           Args(), recv_args, Tensor(), /*is_dead=*/false);
+      return;
+    }
+    RDMA_LOG(1) << "LoadAsync Enqueue Recv Item (key:" << key << "). ";
+    Item* item = new Item;
+    if (cm != nullptr) {
+      auto wrapped_done = std::bind(
+          [cm, token](const DoneCallback& done,
+                      // Begin unbound arguments.
+                      const Status& s, const Args& send_args,
+                      const Args& recv_args, const Tensor& v, bool dead) {
+            cm->TryDeregisterCallback(token);
+            RDMA_LOG(1) << "LoadAsync Enqueue Recv DoneCallback begin...";
+            done(s, send_args, recv_args, v, dead);
+          },
+          std::move(done), std::placeholders::_1, std::placeholders::_2,
+          std::placeholders::_3, std::placeholders::_4,
+          std::placeholders::_5);
+      item->waiter = std::move(wrapped_done);
+    } else {
+      item->waiter = std::move(done);
+    }
+    item->recv_args = recv_args;
+    item->cancellation_token = token;
+    item->request_start_micros_ = request_start_micros;
+    item->recv_start_micros_ = Env::Default()->NowMicros();
+    if (item->recv_args.device_context) {
+      item->recv_args.device_context->Ref();
+    }
+    queue_item->push_back(item);
+    queue_pair->queue_lock_.unlock();
+    return;
+  }
+  RDMA_LOG(1) << "LoadAsync Consume Send Item (key:" << key << "). ";
+  Item* item = queue_item->front();
+  // LOG(INFO) << "QueueRdmaSendWaitRecv_Micros key:" << key << " micros:"
+  //           << Env::Default()->NowMicros() - item->send_start_micros_;
+  if (queue_item->size() == 1) {
+    VLOG(2) << "Clean up Send/Recv queue (key:" << key << "). ";
+    // queue_table_.erase(key_hash);
+  }
+  queue_item->pop_front();
+  queue_pair->queue_lock_.unlock();
+  DCHECK(item->HasValue());
+  done(Status::OK(), item->send_args, recv_args, *(item->value),
+       item->is_dead);
+  delete item;
+}
+
+void LocalDriverBufferMgr::LoadAsync(const string& key, const Args& recv_args,
+                                     DoneCallback done) {
+  LOG(FATAL) << "LoadAsync is not implemented; use QueueLoadAsync.";
+  return;
+}
+
+size_t LocalDriverBufferMgr::InitLocalDriverBufferMgr() {
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr begin...";
+  const auto& tensors_meta_data =
+      channel_->channel_record_->GetChannelTensorsMetaData();
+  const auto& tensors_uid_parsed_key =
+      channel_->channel_record_->GetChannelTensorsUidParsedkey();
+
+  CHECK(tensors_meta_data.size() == tensors_uid_parsed_key.size())
+      << "tensors_meta_data size:"
+      << tensors_meta_data.size()
+      << " tensors_uid_parsed_key size:"
+      << tensors_uid_parsed_key.size();
+
+  std::vector<string> print_keys;
+  for (auto& it : tensors_meta_data) {
+    auto tfi = table_.find(it.first);
+    if (tfi == table_.end()) {
+      table_[it.first] = new Item();
+    }
+    auto qfi = queue_table_.find(it.first);
+    if (qfi == queue_table_.end()) {
+      print_keys.emplace_back(it.first);
+      queue_table_[it.first] = new QueueItems();
+      queue_table_[it.first]->queue = new ItemQueue();
+    }
+  }
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr queue_table_:"
+              << print_keys.size()
+              << " "
+              << absl::StrJoin(print_keys, ",");
+  size_t ready_size = queue_table_.size();
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr end size:" << ready_size;
+  return ready_size;
+}
+
+void LocalDriverBufferMgr::StartAbort(const Status& status) {
+  CHECK(!status.ok());
+  Table table;
+  {
+    status_.Update(status);
+    table_.swap(table);
+  }
+  for (auto& p : table) {
+    Item* item = p.second;
+    // only pending receives carry a callback to notify
+    if (item->HasCallback()) {
+      item->waiter(status, Args(), Args(), Tensor(), false);
+    }
+  }
+}
+
+//*****************************************************************************
+// RdmaTensorRequest
+//*****************************************************************************
+
 RdmaTensorRequest::RdmaTensorRequest(
     uint32_t index, const string& key, int64 step_id, RdmaChannel* channel,
     Device* dst_dev, const Rendezvous::Args recv_args,
     const RdmaTensorRequest::RecvDoneCallback& done)
     : index_(index),
-      key_(key),
       step_id_(step_id),
       channel_(channel),
      dst_dev_(dst_dev),
       recv_args_(recv_args),
-      meta_data_(RdmaMemoryMgr::Singleton().GetTensorMetaData(key)),
       result_tensor_(nullptr),
       proxy_tensor_(nullptr),
       rdma_addr_(nullptr),
       mr_(nullptr),
-      done_(done) {}
+      done_(done),
+      begin_start_req_(0) {
+  key_.assign(key, 0, RdmaMessage::kNameCapacity);
+}
 
-RdmaTensorRequest::~RdmaTensorRequest() { DeallocateTensors(); }
+RdmaTensorRequest::~RdmaTensorRequest() {
+  DeallocateTensors();
+}
 
 void RdmaTensorRequest::Done(const Status& s) {
   Tensor val = std::move(*result_tensor_);
-
-#ifdef RDMA_DATA_VALIDATION
-  // Validate checksum
-  // Unfortunately we can't always do a Checksum directly on the result tensor.
-  // If the result tensor is on GPU, then we need to copy it back to CPU. If
-  // we happen to be in the midst of a proxy callback, then the copying will
-  // get stuck.
-  uint64_t checksum = (proxy_tensor_ != nullptr)
-                          ? Checksum(nullptr, nullptr, *proxy_tensor_)
-                          : Checksum(dst_dev_, recv_args_.device_context, val);
-  ValidateChecksum(checksum_, checksum, val, index_, key_, "RDMA");
-#endif
-
   Rendezvous::Args recv_args = std::move(recv_args_);
   bool is_dead = (meta_data_ == nullptr) ? false : meta_data_->is_dead_;
   RecvDoneCallback done = done_;
   DeallocateTensors();
+  // if (result_region_.get() != nullptr) {
+  //   result_region_->Unref();
+  // }
   channel_->RemoveTensorRequest(index_);
   done(s, Rendezvous::Args(), recv_args, val, is_dead);
 }
 
 void RdmaTensorRequest::DeallocateTensors() {
+  // if (fake_allocator_ != nullptr) {
+  //   LOG(INFO) << "delete fake_allocator";
+  //   delete fake_allocator_;
+  //   fake_allocator_ = nullptr;
+  // }
   if (result_tensor_ != nullptr) {
     delete result_tensor_;
     result_tensor_ = nullptr;
@@ -1524,37 +3119,173 @@ void RdmaTensorRequest::DeallocateTensors() {
   }
 }
 
+size_t RdmaChannel::Alloc(size_t size, void** p, ibv_mr** mr,
+                          bool dynamic, size_t realloc_size) const {
+  size_t allocate_size = size;
+  if (dynamic) {
+    ib_malloc(p, &allocate_size, size, EIGEN_MAX_ALIGN_BYTES);
+    *mr = ibv_reg_mr(pd_, *p, allocate_size,
+                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+    return allocate_size;
+  }
+  // chunk alloc
+  adapter_->recv_chunk_->Alloc(ib_allocate_size(size), p, mr, realloc_size);
+  return allocate_size;
+}
+
+bool RdmaChannel::FindLocalMr(const std::string& key,
+    void** remote_bytes_addr, ibv_mr** mr, int* length) {
+  mutex_lock l(remote_bytes_addr_mu_);
+  auto it = remote_bytes_addr_mrs_.find(key);
+  if (it == remote_bytes_addr_mrs_.end()) {
+    return false;
+  }
+  *remote_bytes_addr = it->second->addr_;
+  *mr = it->second->mr_ptr_;
+  *length = it->second->size_;
+  CHECK(*remote_bytes_addr != nullptr && *mr != nullptr)
+      << "key " << key << " *remote_bytes_addr is null?";
+  return *remote_bytes_addr != nullptr && *mr != nullptr;
+}
+
+void RdmaChannel::FindOrCreateRemoteBytesAddrMemoryRegion(
+    const std::string& key,
+    void** remote_bytes_addr,
+    ibv_mr** mr,
+    std::shared_ptr * region,
+    size_t length,
+    const Allocator* alloc_attr) {
+  int allocate_size = 0;
+  // the region may already hold this addr's info.
+  if ((*region).get() != nullptr && (*region)->size_ > length) {
+    *remote_bytes_addr = (*region)->addr_;
+    *mr = (*region)->mr_ptr_;
+    // (*region)->Ref();
+    return;
+  }
+  // allocate_size = VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+  // allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, true);
+  // *region = std::make_shared(
+  //     *remote_bytes_addr, *mr, allocate_size);
+  // return;
+
+  // TODO(wuyongyu02): a keyed cache is used
+  // (see https://km.sankuai.com/page/641262306); sparse tensors vary in
+  // size, so a large buffer is kept per key.
+  if (!could_send_driver_) {
+    remote_bytes_addr_mu_.lock();
+  }
+  auto it = remote_bytes_addr_mrs_.find(key);
+  if (it == remote_bytes_addr_mrs_.end()) {
+    allocate_size =
+        VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+    // add room for the concatenated DriverPrefixMessage
+    allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+    allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false);
+    *region = std::make_shared(
+        *remote_bytes_addr, *mr, allocate_size);
+    remote_bytes_addr_mrs_[key] = *region;
+    // LOG(INFO) << "#1 key:" << key << " size:" << length;
+    if (!could_send_driver_) {
+      remote_bytes_addr_mu_.unlock();
+    }
+  } else {
+    if (length > it->second->size_) {
+      allocate_size =
+          VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+      // add room for the concatenated DriverPrefixMessage
+      allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+      allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false,
+                            it->second->size_);
+      *region = std::make_shared(
+          *remote_bytes_addr, *mr, allocate_size);
+      it->second = *region;
+      it->second->size_ = allocate_size;
+      // LOG(INFO) << "#2 create new tensor:" << key;
+    }
+    // else if (it->second->RefCountIsOne()) {
+    //   allocate_size =
+    //       VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+    //   allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+    //   allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false);
+    //   *region = std::make_shared(
+    //       *remote_bytes_addr, *mr, allocate_size);
+    // }
    else {
+      *region = it->second;
+      *remote_bytes_addr = it->second->addr_;
+      *mr = it->second->mr_ptr_;
+      // LOG(INFO) << "#3 key:" << key << " size:" << length;
+    }
+    // (*region)->Ref();
+    if (!could_send_driver_) {
+      remote_bytes_addr_mu_.unlock();
+    }
+  }
+}
+
+size_t RdmaChannel::ChannelAllocateTensors(
+    const string& key,
+    const TensorMetaData& meta,
+    const Allocator* alloc_attr,
+    ibv_mr** mr /*new*/,
+    std::shared_ptr * region,
+    void** rdma_addr /*new*/) {
+  size_t max_length = 0;
+  if (DataTypeCanUseMemcpy(meta.data_type_)) {
+    max_length = RecordTensorMetaData::GetTensorLength(meta.data_type_,
+                                                       meta.tensor_shape_);
+  } else {
+    max_length = meta.proto_size_;
+  }
+  // use the keyed allocator for the RdmaTensorRequest
+  FindOrCreateRemoteBytesAddrMemoryRegion(key, rdma_addr, mr, region,
+                                          max_length, alloc_attr);
+  return max_length;
+}
+
+size_t RdmaTensorRequest::GetTensorLength(const TensorMetaData& meta) {
+  size_t max_length = 0;
+  if (DataTypeCanUseMemcpy(meta.data_type_)) {
+    max_length = RecordTensorMetaData::GetTensorLength(meta.data_type_,
+                                                       meta.tensor_shape_);
+  } else {
+    max_length = meta.proto_size_;
+  }
+  return max_length;
+}
+
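// GetTensorLength()/ChannelAllocateTensors() above size the receive buffer
// before the tensor arrives: memcpy-able dtypes use element_count *
// sizeof(dtype) (e.g. a DT_FLOAT tensor of shape [128, 64] needs
// 128 * 64 * 4 = 32768 bytes), while non-memcpy dtypes such as DT_STRING
// fall back to the recorded proto_size_. The buffer is then over-allocated
// by RdmaTensorBufferRatio() plus the DriverPrefixMessage header, per
// FindOrCreateRemoteBytesAddrMemoryRegion() above.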
if (DataTypeCanUseMemcpy(meta_data_->data_type_)) { + fake_allocator_ = new FakeAllocator(rdma_addr_); + result_tensor_ = new Tensor(fake_allocator_, + meta_data_->data_type_, + meta_data_->tensor_shape_); + } else { + // proto + result_tensor_ = new Tensor(dst_dev_->GetAllocator(recv_args_.alloc_attrs), meta_data_->data_type_, meta_data_->tensor_shape_); - + } size_t tensor_size = result_tensor_->TotalBytes(); bool can_memcpy = DataTypeCanUseMemcpy(result_tensor_->dtype()); - if (can_memcpy) { - if (tensor_size == 0) { - return true; - } - rdma_addr_ = DMAHelper::base(result_tensor_); - mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); + if (can_memcpy && tensor_size == 0) { + return true; + } #if GOOGLE_CUDA - if (mr_ == nullptr) { + if (can_memcpy) { // Can't RDMA directly to result. Use a proxy. proxy_tensor_ = new Tensor(GPUProcessState::singleton()->GetGpuHostAllocator(0), result_tensor_->dtype(), result_tensor_->shape()); rdma_addr_ = DMAHelper::base(proxy_tensor_); - mr_ = - RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); + // mr_ = + // RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); } #endif - } else { - uint32_t proto_size = meta_data_->proto_size_; - rdma_addr_ = malloc(proto_size); - mr_ = ibv_reg_mr(RdmaMemoryMgr::Singleton().pd_, rdma_addr_, proto_size, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - } - CHECK(mr_ != nullptr) << " No memory region found for address " << rdma_addr_ + CHECK(mr_ != nullptr) << " No memory region found for address " + << rdma_addr_ << ": " << key_; return true; } @@ -1574,7 +3305,9 @@ void RdmaTensorRequest::AllocateTensorsAsync(StatusCallback done) { } void RdmaTensorRequest::Send(RdmaMessageType message_type) { - RdmaMessageBuffer* rb = channel_->tx_message_buffer_; + int pair_index = (index_ % RdmaChannel::kNumMessageBuffers) / 2; + int buffer_index = 2 * pair_index; + auto* rb = channel_->message_buffers()[buffer_index]; RdmaMessage rm; rm.type_ = message_type; rm.request_index_ = index_; @@ -1591,7 +3324,7 @@ void RdmaTensorRequest::Send(RdmaMessageType message_type) { rm.data_type_ = DT_INVALID; } rm.rkey_ = (mr_ == nullptr) ? 0 : mr_->rkey; - + // rm.create_micros_ = 0; RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec << ": Sending " << MessageTypeToString(message_type) << " #" << index_ << ": " << rm.name_ << " on " << rdma_addr_ @@ -1604,50 +3337,44 @@ void RdmaTensorRequest::Send(RdmaMessageType message_type) { void RdmaTensorRequest::RecvTensorMetaData(DataType dtype, TensorShape shape, bool is_dead, size_t proto_size) { - meta_data_ = RdmaMemoryMgr::Singleton().SetTensorMetaData( + meta_data_ = channel_->SetTensorMetaData( key_, dtype, shape, is_dead, proto_size); - + // channel record MetaData + channel_->channel_record_->Record(key_, *meta_data_); + // global record + // RecordTensorMetaData::Singleton().GlobalRecord(key_, *meta_data_); DeallocateTensors(); + // if (result_region_.get() != nullptr) { + // result_region_->Unref(); + // } AllocateTensorsAsync( [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_RE_REQUEST); }); } void RdmaTensorRequest::RecvTensorContent() { + uint64_t deal_data_begin = Env::Default()->NowMicros(); bool can_memcpy = DataTypeCanUseMemcpy(meta_data_->data_type_); size_t message_size = can_memcpy ? 
result_tensor_->TotalBytes() : meta_data_->proto_size_; + RDMA_LOG(1) << "Step 0x" << std::hex << step_id_ << std::dec << ": Received tensor content #" << index_ << ": " << key_ << " (Size: 0x" << std::hex << message_size << ")"; - Tensor val; - -#if GOOGLE_CUDA - if (proxy_tensor_ != nullptr) { - CountCopies(key_, (void*)DMAHelper::base(proxy_tensor_), - (void*)DMAHelper::base(result_tensor_), - result_tensor_->TotalBytes(), false); - GPUUtil::CopyCPUTensorToGPU(proxy_tensor_, recv_args_.device_context, - dst_dev_, result_tensor_, - [this](const Status& s) { - CHECK(s.ok()) << "copy tensor to gpu sync"; - Done(s); - }, - true /*sync_dst_compute*/); - return; - } -#endif - if (can_memcpy) { + // copy Tensor from rdma_addr_ + // TODO(wuyongyu) + // only the rdma_addr_ has value , can memcpy + // if (result_tensor_->TotalBytes() > 0) { + // memcpy(DMAHelper::base(result_tensor_), (void*)(rdma_addr_), + // result_tensor_->TotalBytes()); + // } + // Recv Tensor memory if can resuse Done(Status::OK()); } else { - RDMA_LOG(2) << "Decoding proto: " << key_ - << " (Size: " << meta_data_->proto_size_ << ")"; TensorProto proto; CHECK(ParseProtoUnlimited(&proto, rdma_addr_, meta_data_->proto_size_)) << "fail to parse proto from array"; - ibv_dereg_mr(mr_); - free(rdma_addr_); Status s = dst_dev_->MakeTensorFromProto(proto, recv_args_.alloc_attrs, result_tensor_); Done(s); @@ -1663,7 +3390,7 @@ void RdmaTensorRequest::RecvErrorStatus(const Status& status) { } void RdmaTensorRequest::Start() { - meta_data_ = RdmaMemoryMgr::Singleton().GetTensorMetaData(key_); + meta_data_ = channel_->GetTensorMetaData(key_); if (meta_data_ != nullptr) { AllocateTensorsAsync( [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_REQUEST); }); @@ -1671,5 +3398,6 @@ void RdmaTensorRequest::Start() { Send(RDMA_MESSAGE_TENSOR_REQUEST); } } - } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma.h b/tensorflow_networking/verbs/rdma.h index bd9460f..4582980 100644 --- a/tensorflow_networking/verbs/rdma.h +++ b/tensorflow_networking/verbs/rdma.h @@ -16,8 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_H_ #define TENSORFLOW_CONTRIB_VERBS_RDMA_H_ -#include +#ifdef TENSORFLOW_USE_VERBS +#include #include // for memset #include #include // for shared_ptr @@ -25,7 +26,11 @@ limitations under the License. #include #include #include +#include +#include +#include +#include "tensorflow_networking/verbs/verbs_util.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor.h" @@ -33,7 +38,12 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" -#include "tensorflow_networking/verbs/verbs_util.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow_networking/verbs/verbs_service.pb.h" +#include "tensorflow_networking/verbs/grpc_verbs_client.h" +#include "absl/container/flat_hash_map.h" +#include "tensorflow_networking/verbs/rdma_mgr.h" + namespace tensorflow { #define PKEY_DEFAULT 0 @@ -80,6 +90,7 @@ enum RdmaMessageType { RDMA_MESSAGE_TENSOR_RE_REQUEST, RDMA_MESSAGE_TENSOR_REQUEST, RDMA_MESSAGE_ERROR_STATUS, + RDMA_MESSAGE_DRIVER_BEGIN }; struct RdmaMessage { @@ -87,7 +98,7 @@ struct RdmaMessage { uint16_t name_size_; string name_; int64 step_id_; - uint64_t request_index_; + uint32_t request_index_; union { uint64_t remote_addr_; #ifdef RDMA_DATA_VALIDATION @@ -100,13 +111,20 @@ struct RdmaMessage { TensorShape tensor_shape_; size_t tensor_bytes_; + // int64 create_micros_; + + // uint32_t remote_bytes_addr_key_; + // uint64_t remote_bytes_addr_; // For error status: Status status_; + // (wuyongyu02) add the 'create_micros' for cat log // type|name_size|name|step_id|request_index|remote_addr/checksum|rkey|... // 1B| 2B | 512| 8B | 8B | 8B | 4B |... - // ...|is_dead|data_type|tensor_shape|tensor_bytes|error_status | - // ...| 1B | XB | XB | 8B |size - 4B, proto - XB | + // ...|is_dead|data_type|tensor_shape|tensor_bytes|create_micros |... + // ...| 1B | XB | XB | 8B | 8B |... + // ...|remote_bytes_addr| error_status | + // ...|8B | size - 4B, proto - XB | static const size_t kNameCapacity = 512; static const size_t kTypeStartIndex = 0; static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_); @@ -127,8 +145,14 @@ struct RdmaMessage { kDataTypeStartIndex + sizeof(data_type_); static const size_t kTensorBytesStartIndex = kTensorShapeStartIndex + sizeof(TensorShape); + // static const size_t kCreateMicrosStartIndex = + // kTensorBytesStartIndex + sizeof(tensor_bytes_); + + // static const size_t kErrorStatusStartIndex = + // kCreateMicrosStartIndex + sizeof(create_micros_); static const size_t kErrorStatusStartIndex = kTensorBytesStartIndex + sizeof(tensor_bytes_); + static const size_t kErrorStatusMaxSize = 4096; static const size_t kMessageTotalBytes = kErrorStatusStartIndex; @@ -138,18 +162,70 @@ struct RdmaMessage { static void ParseMessage(RdmaMessage& rm, void* buffer); }; +// Parse a RdmaMessage according to the pre-defined format +// Args: +// rm: the message structure where the parsed message will be saved +// buffer: the place where the raw message is stored +// Returns: +// None +struct FussionMessages { + /* data */ + static const size_t kRdmaMaxMessagesNumber = 50; + uint32_t message_numbers; + uint32_t message_size[kRdmaMaxMessagesNumber]; + std::string messages[kRdmaMaxMessagesNumber]; + /* func */ + static string CreateFusionMessages(const std::vector& rmv); + static void ParseFussionMessages(std::vector& rmv, void* buffer); + + /* index */ + static const size_t kMessageNumbersStartIndex = 0; + static const size_t kMessageSizeStartIndex = kMessageNumbersStartIndex + + sizeof(message_numbers); + static const size_t KStringMessagesStartIndex = kMessageSizeStartIndex + + sizeof(message_size); + static const size_t kTotalFussionMessageSize = KStringMessagesStartIndex + + kRdmaMaxMessagesNumber * RdmaMessage::kRdmaMessageBufferSize; +}; + +class FakeAllocator : public Allocator { + public: + FakeAllocator(void* buffer) : buffer_(buffer) {} 
+  string Name() override { return "fake_allocator"; }
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    // Simply return the pre-allocated buffer.
+    return buffer_;
+  }
+  void DeallocateRaw(void* ptr) override {
+    // TODO(wyy): will the real owner free buffer_?
+    // free(buffer_);
+    // port::AlignedFree(buffer_);
+  }
+
+ private:
+  // The buffer should be 64-byte aligned.
+  void* buffer_ = nullptr;
+};
+
+class RdmaChannel;
+class ChannelRecordTensorMetaData;
+class RdmaSendDriverMgr;
+
 // Immediate types for RDMA write
+const int Const_kNumMessageBuffers = 80;  // originally 80
 enum RdmaImmDataType {
-  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFD,
-  RDMA_IMM_DATA_ACK = 0xFFFFFFFE,
-  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF
+  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFF - 2 * Const_kNumMessageBuffers - 2,
+  RDMA_IMM_DATA_ACK = 0xFFFFFFFF - Const_kNumMessageBuffers - 1,
+  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF,
+  RDMA_IMM_MIN_SENDMGR_BASE = int(RDMA_IMM_MAX_REQUEST_ID / 2 + 1),
 };
 
 // Write types for RDMA write-complete events
 enum RdmaWriteIDType {
   RDMA_WRITE_ID_ACK,
   RDMA_WRITE_ID_MESSAGE,
-  RDMA_WRITE_ID_TENSOR_WRITE
+  RDMA_WRITE_ID_TENSOR_WRITE,
+  RDMA_WRITE_ID_SEND_DEIVER_WRITE
 };
 
 // Context for RDMA write-complete events
@@ -169,6 +245,9 @@ class TensorMetaData {
   DataType data_type_;
   size_t proto_size_;
   bool is_dead_;
+  uint32 uid_;
+  // Records whether the meta data changed, for the send-driven path.
+  bool meta_changed_ = false;
 
   std::ostream& print(std::ostream& out) const {
     out << "Dtype = " << DataTypeString(data_type_)
@@ -183,8 +262,6 @@ inline std::ostream& operator<<(std::ostream& out,
   return meta_data.print(out);
 }
 
-class RdmaChannel;
-
 void MRDeleter(ibv_mr* mr);
 using MemoryRegionPtr = std::unique_ptr<ibv_mr, decltype(&MRDeleter)>;
@@ -192,42 +269,175 @@
 // Manages the local meta-data cache, and the registered RDMA memory regions.
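// --- Illustrative aside (not part of the patch): what FakeAllocator is for.
// The NIC has already written the tensor bytes into an RDMA-registered
// buffer, so the receive path wraps that memory in a Tensor instead of
// copying. WrapRdmaBuffer is a hypothetical helper; Tensor(Allocator*,
// DataType, TensorShape) is the stock TensorFlow constructor.
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {
inline Tensor WrapRdmaBuffer(void* rdma_addr, DataType dtype,
                             const TensorShape& shape) {
  // Leaked on purpose here, mirroring fake_allocator_ in rdma.cc. The Tensor
  // never frees the buffer (DeallocateRaw above is a no-op), so the
  // registered region must outlive the Tensor.
  FakeAllocator* fake_allocator = new FakeAllocator(rdma_addr);
  return Tensor(fake_allocator, dtype, shape);
}
}  // namespace tensorflow
// --- end aside ---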
 class RdmaMemoryMgr {
  public:
-  static RdmaMemoryMgr& Singleton() {
-    static RdmaMemoryMgr instance;
-    return instance;
-  }
+  explicit RdmaMemoryMgr(struct ibv_pd* pd) : pd_(pd) {}
+  // static RdmaMemoryMgr& Singleton() {
+  //   static RdmaMemoryMgr instance;
+  //   return instance;
+  // }
 
-  // Memory regions
   ibv_mr* FindMemoryRegion(void* addr, size_t length);
+
   void InsertMemoryRegion(void* addr, size_t length,
                           const std::string& allocator_name);
   void EvictMemoryRegion(void* addr, size_t length);
 
-  // Tensor meta-data cache
-  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
-  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
-                                          DataType dtype,
-                                          const TensorShape& shape,
-                                          bool is_dead, size_t proto_size);
+  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
+    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
+  }
 
   struct ibv_pd* pd_;
 
- protected:
-  RdmaMemoryMgr() : pd_(nullptr) {}
+ private:
+  // Managed memory regions
+  mutex mrs_mu_;
+  std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(mrs_mu_);
+};
 
-  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
-    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
-  }
+class RecordTensorMetaData {
+ public:
+  RecordTensorMetaData() {
+    // stop_.store(true, std::memory_order_relaxed);
+    total_bytes_ = 0;
+  }
+
+  ~RecordTensorMetaData() {
+    // stop_.store(false, std::memory_order_relaxed);
+  }
+
+  static RecordTensorMetaData& Singleton() {
+    static RecordTensorMetaData instance;
+    return instance;
+  }
+
+  static uint32 GetTensorLength(const DataType& data_type,
+                                const TensorShape& tensor_shape) {
+    return GetEnumSize(data_type) * tensor_shape.num_elements();
+  }
+
+  static uint32 GetEnumSize(const DataType& data_type);
+
+  void GlobalRecord(const std::string& origin_tensor_name,
+                    const TensorMetaData& m, bool stop_record = false);
+
+  typedef std::unordered_map<std::string, TensorMetaData> GTensorMetaType;
+  typedef std::unordered_map<uint32, std::string> GTensorsUidKeyType;
+
+  const GTensorMetaType& GetGlobalTensorsMetaData() {
+    return global_tensors_meta_data_;
+  }
+
+  const GTensorsUidKeyType& GetGlobalTensorsUidParsedkey() {
+    return global_tensors_uid_parsed_key_;
+  }
+
+  string DebugString() const;
+
+  void WriteOutput(const std::string& content) const;
+
+  void ReadFile(const std::string& filename, StringPiece* content);
+
  private:
-  mutex tensor_meta_data_mu_;
-  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+  mutex global_tensor_meta_data_mu_;
+  GTensorMetaType global_tensors_meta_data_;
+  GTensorsUidKeyType global_tensors_uid_parsed_key_;
+  // uid_ must stay below RDMA_IMM_MAX_REQUEST_ID
+  uint32 uid_ = RDMA_IMM_MIN_SENDMGR_BASE;
+  // std::atomic<bool> stop_;
+  uint64 total_bytes_;
+  string local_worker_name_ = "";
+};
 
-  // Managed memory regions
-  mutex mrs_mu_;
-  std::vector<MemoryRegionPtr> mrs_ TF_GUARDED_BY(mrs_mu_);
+// Owned by RdmaChannel: buffers locally produced tensors and waiting
+// consumers for the send-driven transfer path.
+class LocalDriverBufferMgr {
+ public:
+  explicit LocalDriverBufferMgr(RdmaChannel* channel) : channel_(channel) {
+    DCHECK(channel != nullptr)
+        << "LocalDriverBufferMgr construct channel is nullptr.";
+  }
+
+  typedef Rendezvous::DoneCallback DoneCallback;
+  typedef Rendezvous::Args Args;
+  typedef Rendezvous::ParsedKey ParsedKey;
+  struct Item {
+    mutex item_lock_;
+    DoneCallback waiter = nullptr;
+    Tensor* value;
+    bool is_dead = false;
+    bool has_value = false;
+    Args send_args;
+    Args recv_args;
+    CancellationToken cancellation_token;
+    uint64 send_start_micros_;
+    uint64 recv_start_micros_;
+    uint64 request_start_micros_;
+
+    ~Item() {
+      if (send_args.device_context) {
+        send_args.device_context->Unref();
+      }
+      if (recv_args.device_context) {
+        recv_args.device_context->Unref();
+      }
+      if (value != nullptr) {
+        // delete value;
+      }
+    }
+
+    // Returns true iff this item represents a value being sent.
+    bool HasCallback() const { return this->waiter != nullptr; }
+
+    bool HasValue() const { return this->has_value; }
+  };
+
+  typedef std::deque<Item*> ItemQueue;
+
+  struct QueueItems {
+    ItemQueue* queue;
+    mutex queue_lock_;
+  };
+
+  typedef gtl::FlatMap<string, Item*> Table;
+
+  typedef gtl::FlatMap<string, QueueItems*> QueueTable;
+
+  size_t InitLocalDriverBufferMgr();
+
+  Status RdmaSave(const string& key, const Args& send_args, const Tensor& val,
+                  const bool is_dead);
+
+  Status QueueRdmaSave(const string& key, const Args& send_args,
+                       Tensor* val, const bool is_dead,
+                       const uint64& send_begin_micros);
+
+  void LoadAsync(const string& key, const Args& recv_args,
+                 DoneCallback done);
+
+  void QueueLoadAsync(const string& key, const Args& recv_args,
+                      DoneCallback done, const uint64& request_start_micros);
+
+  void StartAbort(const Status& status);
+
+  ~LocalDriverBufferMgr() {
+    if (!table_.empty()) {
+      StartAbort(errors::Cancelled("LocalDriverBufferMgr deleted"));
+    }
+  }
+
+ public:
+  bool use_queue_item_ = true;
+
+ private:
+  RdmaChannel* channel_;  // not owned
+  Table table_;  // GUARDED_BY(mu_);
+  QueueTable queue_table_;
+  Status status_ = Status::OK();  // GUARDED_BY(mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(LocalDriverBufferMgr);
+};
+
+class RemoteBytesAddrMemoryRegion;
+
 // RdmaTensorRequest
 // Represents a single tensor request.
 class RdmaTensorRequest {
@@ -269,6 +479,10 @@
   // Invoke Done() with the status code.
   void RecvErrorStatus(const Status& status);
 
+  RdmaChannel* rdma_channel() {
+    return channel_;
+  }
+
 #ifdef RDMA_DATA_VALIDATION
   // Receive tensor checksum
   //
@@ -277,6 +491,12 @@
   // checksum right before invoking Done().
   void RecvTensorChecksum(uint64_t checksum) { checksum_ = checksum; }
 #endif
+  uint64_t begin_start_req_;
+  string key_;
+  // SendMetaData message micros
+  // uint64_t rm_create_micros_;
+  RecvDoneCallback done_;
+  Rendezvous::Args recv_args_;
 
  private:
   void Done(const Status& s);
@@ -285,30 +505,56 @@
   void AllocateTensorsAsync(StatusCallback done);
   void DeallocateTensors();
 
+  size_t GetTensorLength(const TensorMetaData& meta);
+
   uint32_t index_;
-  string key_;
   int64 step_id_;
   RdmaChannel* channel_;
   Device* dst_dev_;
-  Rendezvous::Args recv_args_;
+
   const TensorMetaData* meta_data_;
+  FakeAllocator* fake_allocator_ = nullptr;
   Tensor* result_tensor_;
+
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> result_region_;
   Tensor* proxy_tensor_;
   void* rdma_addr_;
+  // void* rdma_remote_bytes_addr_ = nullptr;
+  // ibv_mr* remote_bytes_addr_mr_ = nullptr;
   ibv_mr* mr_;
-  RecvDoneCallback done_;
#ifdef RDMA_DATA_VALIDATION
   uint64_t checksum_;
 #endif
 };
 
+struct DriverEntry;
+
 // RdmaTensorResponse
 // Represents a single tensor response.
 class RdmaTensorResponse {
  public:
   // Creates a response for request message.
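// --- Illustrative aside (not part of the patch): the save/load rendezvous
// pattern LocalDriverBufferMgr declares, reduced to a standalone sketch.
// MiniRendezvous and its members are stand-ins; the real class stores
// Tensors, Args and cancellation state per key.
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>

class MiniRendezvous {
 public:
  using Done = std::function<void(const std::string& value)>;

  // Producer side: publish a value, waking a parked consumer if present.
  void Save(const std::string& key, const std::string& value) {
    Done waiter;
    {
      std::lock_guard<std::mutex> l(mu_);
      Item& it = items_[key];
      if (it.waiter) {
        waiter = std::move(it.waiter);  // the consumer arrived first
      } else {
        it.value = value;
        it.has_value = true;
        return;
      }
    }
    waiter(value);  // run the callback outside the lock
  }

  // Consumer side: fire immediately if the value is there, else park.
  void LoadAsync(const std::string& key, Done done) {
    std::string value;
    {
      std::lock_guard<std::mutex> l(mu_);
      Item& it = items_[key];
      if (!it.has_value) {
        it.waiter = std::move(done);  // the producer is not here yet
        return;
      }
      value = it.value;
    }
    done(value);
  }

 private:
  struct Item {
    std::string value;
    bool has_value = false;
    Done waiter;
  };
  std::mutex mu_;
  std::unordered_map<std::string, Item> items_;
};
// --- end aside ---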
   RdmaTensorResponse(RdmaChannel* channel, const RdmaMessage& rm)
-      : channel_(channel), rm_(rm) {}
+      : channel_(channel), rm_(rm) {
+    // The request name is a full rendezvous key:
+    //   strings::StrCat(
+    //       src_device, ";", strings::Uint64ToHexString(src_incarnation, buf),
+    //       ";", dst_device, ";", name, ";", frame_iter.frame_id, ":",
+    //       frame_iter.iter_id);
+    // Extract the destination device (the third component) from it.
+    if (!rm.name_.empty()) {
+      size_t found = rm.name_.find(";");
+      string str = rm.name_.substr(found + 1, rm.name_.size());
+
+      found = str.find(";");
+      str = str.substr(found + 1, str.size());
+
+      found = str.find(";");
+      req_to_device_ = str.substr(0, found);
+      parsed_key_ = rm.name_;
+    } else {
+      req_to_device_ = "";
+      parsed_key_ = "";
+    }
+  }
 
   void Update(const RdmaMessage& rm) { rm_ = rm; }
@@ -333,20 +579,36 @@
   // Destroy the response's resources and remove it from the pending list.
   void Destroy();
 
+ public:
+  uint64 request_index_;
+  uint64 recv_local_send_rdma_;
+  uint64 recv_send_content_ = 0;
+  uint64 send_meta_begin_;
+  string parsed_key_;
+  string req_to_device_;
+
  private:
-  void RecvHandler(Rendezvous::ParsedKey parsed,
-                   const Rendezvous::Args& send_args,
+  void RecvHandler(const Rendezvous::Args& send_args,
                    const Rendezvous::Args& recv_args, const Tensor& in,
                    bool is_dead);
+
   void Clone(const Tensor& in, const TensorProto& proto, bool is_dead);
+
+  void RdmaClone(const Tensor& in, const TensorProto& proto, bool is_dead);
+
   void Send(const Tensor& in, const TensorProto& proto, bool is_dead,
             const Status& status);
+  void SendBck(const Tensor& in, const TensorProto& proto, bool is_dead,
+               const Status& status);
+
   bool TensorMetaDataChanged(const Tensor& in, bool is_dead);
   Status PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
                            Device** src_dev);
   void SendMetaData(const Tensor& in, const TensorProto& proto, bool is_dead);
-  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead);
-  void SendErrorStatus(const Status& status);
+  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead,
+                   bool is_resume);
+  void SendErrorStatus(const Status& status, const std::string& src_func_name);
 
   RdmaChannel* channel_;
   RdmaMessage rm_;  // The request message
@@ -361,6 +623,35 @@
   TensorProto* proto_ = nullptr;
   Tensor* tensor_ = nullptr;
   bool is_dead_ = false;
+
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> res_region_;
+  FakeAllocator* res_fake_allocator_;
+};
+
+// Grows large registered chunks and hands out sub-allocations from them.
+class Chunk {
+ public:
+  explicit Chunk(struct ibv_pd* pd);
+
+  void FreeChunk();
+
+  ~Chunk();
+
+  void Alloc(size_t size, void** p, ibv_mr** mr, size_t realloc_size = 0);
+
+ private:
+  void* new_p_;
+  ibv_mr* new_mr_;
+  size_t chunk_addr_size = 64 * 1024 * 1024;
+  uint64 offset_;
+  uint64 curr_size_;
+  uint64 empty_size_;
+  uint64 total_waste_size_;
+  uint64 total_realloc_size_;
+  mutex alloc_mu_;
+  int allocate_size_;
+  struct ibv_pd* pd_;
+  std::vector<ibv_mr*> mrs_;
+  std::vector<void*> chunk_addrs_;
 };
 
 class RdmaMessageBuffer;
@@ -371,8 +662,11 @@ class RdmaAdapter {
   friend class RdmaChannel;
   friend class RdmaMessageBuffer;
   friend class RdmaTensorResponse;
+  friend class RdmaTensorRequest;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
+  friend class RdmaSendDriverMgr;
+  friend class ChannelRecordTensorMetaData;
 
  public:
   RdmaAdapter(const WorkerEnv* worker_env);
@@ -380,29 +674,298 @@
   // Adapter name, e.g. mlx5_0.
   string name() const;
   void StartPolling();
-  void Process_CQ();
+  void Pool_Process_CQ(int cq_num);
+  void Process_WR(ibv_wc wc_, int cq_num);
 
  protected:
-  static const int MAX_CONCURRENT_WRITES = 1000;
+  thread::ThreadPool* pool_;
+  static const int MAX_CONCURRENT_WRITES = 5000;  // originally 1000, then 5000
   ibv_context* context_;
   // RDMA configuration parameters
   RdmaParams params_;
   // ibverbs protection domain
   ibv_pd* pd_;
   // Completion event channel, to wait for work completions
-  ibv_comp_channel* event_channel_;
+  ibv_comp_channel** event_channel_vec_;
+
   // Completion queue, to poll on work completions
-  ibv_cq* cq_;
+  ibv_cq** cq_vec_;
+  // number of completion queues
+  int cq_nums_;
   // Pre-allocated work completions array used for polling
-  ibv_wc wc_[MAX_CONCURRENT_WRITES * 2];
+  ibv_wc** wc_vec_;
   // worker env for thread
   const WorkerEnv* worker_env_;
   // thread for cq.
-  std::unique_ptr<Thread> polling_thread_;
+  std::vector<std::unique_ptr<Thread>> polling_thread_vec_;
+  Chunk* recv_chunk_ = nullptr;
 };
 
 // Class that represents a connection to a remote Rdma peer.
 // Responsible for connecting queue pairs.
+class RemoteBytesAddrMemoryRegion {
+ public:
+  RemoteBytesAddrMemoryRegion(void* addr, ibv_mr* mr, size_t s) {
+    mr_ptr_ = mr;
+    addr_ = addr;
+    size_ = s;
+    ref_.store(0);
+  }
+
+  // TODO(wuyongyu02): mr_ptr_ still needs an ibv_dereg_mr somewhere.
+  ~RemoteBytesAddrMemoryRegion() {
+    if (mr_ptr_ != nullptr && addr_ != nullptr) {
+      // ibv_dereg_mr(mr_ptr_);
+      // free(addr_);
+      addr_ = nullptr;
+      mr_ptr_ = nullptr;
+    }
+  }
+
+  // NOTE: despite the name, this returns true whenever the region holds at
+  // least one reference.
+  bool RefCountIsOne() const {
+    return (ref_.load(std::memory_order_acquire) >= 1);
+  }
+
+  void Ref() const {
+    ref_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  // Drops all references unconditionally.
+  bool Unref() const {
+    ref_.store(0);
+    return true;
+  }
+
+  mutable std::atomic_int_fast32_t ref_;
+  void* addr_;
+  ibv_mr* mr_ptr_;
+  size_t size_;
+};
+
+// Prefix prepended to each send-driven tensor write: shape, byte count,
+// is_dead flag and a send timestamp.
+struct DriverPrefixMessage {
+  TensorShape tensor_shape_;
+  size_t tensor_bytes_;
+  bool is_dead_;
+  uint64 send_micros_;
+  // Short layout, used when the meta data has not changed:
+  static const size_t CKIsDeadIndexStartIndex = 0;
+  static const size_t CkSendMiscrosStartIndex =
+      CKIsDeadIndexStartIndex + sizeof(is_dead_);
+  static const size_t CkPrefixMessageTotalBytes =
+      CkSendMiscrosStartIndex + sizeof(send_micros_);
+
+  // Full layout, used when the meta data changed:
+  static const size_t kTensorShapeStartIndex = 0;
+  static const size_t kTensorBytesStartIndex =
+      kTensorShapeStartIndex + sizeof(tensor_shape_);
+  static const size_t KIsDeadIndexStartIndex =
+      kTensorBytesStartIndex + sizeof(tensor_bytes_);
+
+  static const size_t KSendMicrosStartIndex =
+      KIsDeadIndexStartIndex + sizeof(is_dead_);
+
+  static const size_t kPrefixMessageTotalBytes =
+      KSendMicrosStartIndex + sizeof(send_micros_);
+
+  static std::string CreateDriverPrefixMessage(const TensorShape& shape,
+      const size_t& tensor_bytes, const bool& is_dead,
+      const uint64& send_micros, const bool& meta_changed) {
+    if (meta_changed) {
+      char message[kPrefixMessageTotalBytes + 100];
+      memcpy(message + kTensorShapeStartIndex, &shape, sizeof(shape));
+      memcpy(message + kTensorBytesStartIndex, &tensor_bytes,
+             sizeof(tensor_bytes));
+      memcpy(message + KIsDeadIndexStartIndex, &is_dead, sizeof(is_dead));
+      memcpy(message + KSendMicrosStartIndex, &send_micros,
+             sizeof(send_micros));
+      return std::string(message, kPrefixMessageTotalBytes);
+    } else {
+      char message[CkPrefixMessageTotalBytes + 100];
+      memcpy(message + CKIsDeadIndexStartIndex, &is_dead, sizeof(is_dead));
+      memcpy(message + CkSendMiscrosStartIndex, &send_micros,
+             sizeof(send_micros));
+      return std::string(message, CkPrefixMessageTotalBytes);
+    }
+  }
+
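// --- Illustrative aside (not part of the patch): the fixed-offset prefix
// encoding above, as a standalone round-trip with POD fields only (the real
// struct memcpys a TensorShape object, which is riskier). Field names are
// illustrative.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

struct Prefix {
  uint64_t tensor_bytes;
  bool is_dead;
  uint64_t send_micros;

  static constexpr size_t kBytesOff = 0;
  static constexpr size_t kDeadOff = kBytesOff + sizeof(uint64_t);
  static constexpr size_t kMicrosOff = kDeadOff + sizeof(bool);
  static constexpr size_t kTotal = kMicrosOff + sizeof(uint64_t);

  std::string Encode() const {
    char buf[kTotal];
    std::memcpy(buf + kBytesOff, &tensor_bytes, sizeof(tensor_bytes));
    std::memcpy(buf + kDeadOff, &is_dead, sizeof(is_dead));
    std::memcpy(buf + kMicrosOff, &send_micros, sizeof(send_micros));
    return std::string(buf, kTotal);
  }

  static Prefix Decode(const void* addr) {
    const char* buf = static_cast<const char*>(addr);
    Prefix p;
    std::memcpy(&p.tensor_bytes, buf + kBytesOff, sizeof(p.tensor_bytes));
    std::memcpy(&p.is_dead, buf + kDeadOff, sizeof(p.is_dead));
    std::memcpy(&p.send_micros, buf + kMicrosOff, sizeof(p.send_micros));
    return p;
  }
};

int main() {
  Prefix in{1024, false, 1700000000ull};
  std::string wire = in.Encode();
  Prefix out = Prefix::Decode(wire.data());
  assert(out.tensor_bytes == in.tensor_bytes &&
         out.send_micros == in.send_micros);
  return 0;
}
// --- end aside ---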
+  static DriverPrefixMessage ParseDriverPrefixMessage(void* addr,
+      const bool& meta_changed) {
+    if (meta_changed) {
+      char* message = static_cast<char*>(addr);
+      DriverPrefixMessage m;
+      memcpy(&m.tensor_shape_, message + kTensorShapeStartIndex,
+             sizeof(m.tensor_shape_));
+      memcpy(&m.tensor_bytes_, message + kTensorBytesStartIndex,
+             sizeof(m.tensor_bytes_));
+      memcpy(&m.is_dead_, message + KIsDeadIndexStartIndex,
+             sizeof(m.is_dead_));
+      memcpy(&m.send_micros_, message + KSendMicrosStartIndex,
+             sizeof(m.send_micros_));
+      return m;
+    } else {
+      char* message = static_cast<char*>(addr);
+      DriverPrefixMessage m;
+      memcpy(&m.is_dead_, message + CKIsDeadIndexStartIndex,
+             sizeof(m.is_dead_));
+      memcpy(&m.send_micros_, message + CkSendMiscrosStartIndex,
+             sizeof(m.send_micros_));
+      return m;
+    }
+  }
+};
+
+enum DriverStatus {
+  DRIVER_INIT,
+  RPC_0,
+  RPC_1,
+  DATA_NOT_READY,
+  DATA_READY,
+  DRIVER_ERROR
+};
+
+struct DriverEntry {
+ public:
+  DriverEntry(const uint32& uid,
+              const std::string& parsedkey,
+              void* addr,
+              ibv_mr* mr,
+              int allocate_size);
+
+  DriverEntry();
+
+  uint32 uinque_id_;
+  std::string parsed_key_;
+  std::atomic<DriverStatus> dri_status_;
+  // saved tensor data and string message
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> mem_mr_;
+  // uint32 prefix_msg_len_;
+  std::string prefix_msg_;
+  int allocate_size_ = 0;
+  // lkey of the registered region
+  uint32_t lkey_;
+  // address of the registered region
+  uint64_t addr_;
+  // records whether the meta data changed
+  bool meta_changed_ = false;
+
+  // allocated for sending the prefix string
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> send_mem_mr_;
+
+  // send-side tensor buffer reference
+  TensorBuffer* src_buffer_ = nullptr;
+  // send-side tensor memory region
+  struct ibv_mr* smr_ = nullptr;  // not owned
+  // address used when the tensor can be memcpy-ed directly
+  void* tensor_addr_ = nullptr;
+
+  int local_allocate_size_ = 0;
+
+  // allocated for sending the tensor
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> send_region_;
+
+  uint64 send_micros_ = 0;
+};
+
+class RdmaSendDriverMgr {
+  friend class RdmaChannel;
+  friend class ChannelRecordTensorMetaData;
+  friend class RdmaAdapter;
+
+ public:
+  explicit RdmaSendDriverMgr(RdmaChannel* channel);
+
+  size_t InitLocalDriverEntry();
+
+  void NotifyRemoteDriverEntry();
+
+  ~RdmaSendDriverMgr() {
+  }
+
+  // send service: updates recv_entries_
+  void RpcUpdateRemoteDriverEntry(const DriverMessageReq* request,
+                                  DriverMessageResp* response);
+
+  // recv client: updates driver_entries_
+  void RpcUpdateDriverEntries(const DriverMessageResp& resp);
+
+  bool RpcReqResp(GrpcVerbsClient* client, const DriverMessageReq& req);
+
+  void AllocateRecvEntriesStringMemoryAndRegion();
+
+  std::shared_ptr<DriverEntry> GetRecvEntry(const std::string& parsed_key,
+                                            bool* has_data);
+
+  std::shared_ptr<DriverEntry> GetDriverEntry(const std::string& parsed_key,
+                                              bool* has_data);
+
+ public:
+  std::atomic<bool> driver_mgr_is_ok_;
+  typedef std::unordered_map<std::string, std::shared_ptr<DriverEntry>>
+      EntryMapType;
+  // typedef absl::flat_hash_map<std::string, std::shared_ptr<DriverEntry>>
+  //     EntryMapType;
+
+ protected:
+  RdmaChannel* channel_;
+  EntryMapType driver_entries_;
+  EntryMapType recv_entries_;
+};
+
+class ChannelRecordTensorMetaData {
+ public:
+  // typedef absl::flat_hash_map<std::string, TensorMetaData> RecordMapType;
+  typedef std::unordered_map<std::string, TensorMetaData> RecordMapType;
+  typedef std::unordered_map<uint32, std::string> RecordMapUniIdType;
+
+  explicit ChannelRecordTensorMetaData(RdmaChannel* channel);
+
+  static uint32 GetEnumSize(const DataType& data_type);
+
+  static int GetTensorBytes(const TensorMetaData& m);
+
+  void AllocateMemoryAndRegion(const string& key,
+                               const TensorMetaData& m,
+                               ibv_pd* pd,
+                               void** addr,
+                               ibv_mr** mr,
+                               int* addr_size,
+                               Allocator* alloc_attr = nullptr) const;
+
+  void AllocateSendStringMemoryAndRegion(ibv_pd* pd,
+                                         void** addr,
+                                         ibv_mr** mr,
+                                         int* addr_size,
+                                         Allocator* alloc_attr = nullptr);
+
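// --- Illustrative aside (not part of the patch): DriverEntry::dri_status_
// above walks through the DriverStatus values as a small handshake. The
// transition order here is inferred from the enum names, not confirmed by
// the patch, and Advance() is a hypothetical helper. Standalone:
#include <atomic>
#include <cstdio>

enum HandshakeState { INIT, RPC0, RPC1, NOT_READY, READY, FAILED };

// Advance only along the expected edge; any other transition fails.
static bool Advance(std::atomic<HandshakeState>& s, HandshakeState from,
                    HandshakeState to) {
  HandshakeState expected = from;
  return s.compare_exchange_strong(expected, to);
}

int main() {
  std::atomic<HandshakeState> s{INIT};
  bool ok = Advance(s, INIT, RPC0) && Advance(s, RPC0, RPC1) &&
            Advance(s, RPC1, NOT_READY) && Advance(s, NOT_READY, READY);
  std::printf("handshake %s\n", ok ? "complete" : "failed");
  return ok ? 0 : 1;
}
// --- end aside ---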
+  void Record(const std::string& tensor_name,
+              const TensorMetaData& m);
+
+  static StringPiece ConsumeNextPart(StringPiece* s, char delim);
+
+  static string RegexEdgeName(const string& str);
+
+  void InitMetaDataFromEnv();
+
+  const RecordMapType& GetChannelTensorsMetaData() {
+    return channel_tensors_meta_data_;
+  }
+
+  const RecordMapUniIdType& GetChannelTensorsUidParsedkey() {
+    return channel_tensors_uid_parsed_key_;
+  }
+
+ public:
+  RecordMapType channel_tensors_meta_data_;
+
+  RecordMapUniIdType channel_tensors_uid_parsed_key_;
+
+ private:
+  RdmaChannel* channel_;
+  mutex channel_tensor_meta_data_mu_;
+  // uid_ must stay below RDMA_IMM_MAX_REQUEST_ID
+  uint32 uid_ = RDMA_IMM_MIN_SENDMGR_BASE;
+};
+
+class RdmaMgr;
 class RdmaChannel {
   friend class RdmaAdapter;
   friend class RdmaMessageBuffer;
@@ -411,10 +974,14 @@ class RdmaChannel {
   friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
+  friend class RdmaSendDriverMgr;
+  friend class ChannelRecordTensorMetaData;
 
  public:
   explicit RdmaChannel(const RdmaAdapter* adapter, const string local_name,
-                       const string remote_name_);
+                       const string remote_name_,
+                       GrpcChannelCache* channel_cache, ibv_cq* cq);
+
   ~RdmaChannel();
   inline const RdmaAddress& self() { return self_; }
   RdmaAddress address() const;
@@ -439,8 +1006,84 @@ class RdmaChannel {
   RdmaTensorResponse* UpdateTensorResponse(const RdmaMessage& rm);
   void RemoveTensorResponse(uint32_t request_index);
 
-  static const int kNumMessageBuffers = 2;
+  // static const int kNumMessageBuffers = 2;
+  static const int kNumMessageBuffers = Const_kNumMessageBuffers;
   static const int kPingRecvWrid = 0;
+
+  // For CAT logging
+  RdmaTensorRequest* GetTensorRequestForCat(uint32_t request_index);
+
+  inline size_t Alloc(size_t size, void** p, ibv_mr** mr,
+                      bool dynamic = false, size_t realloc_size = 0) const;
+
+  bool FindLocalMr(const std::string& key, void** remote_bytes_addr,
+                   ibv_mr** mr, int* length);
+
+  inline void FindOrCreateRemoteBytesAddrMemoryRegion(const std::string& key,
+      void** remote_bytes_addr /*new*/,
+      ibv_mr** mr /*new*/,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>* region,
+      size_t length,
+      const Allocator* alloc_attr = nullptr);
+
+  size_t ChannelAllocateTensors(const string& key, const TensorMetaData& meta,
+      const Allocator* alloc_attr, ibv_mr** mr /*new*/,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>* region,
+      void** rdma_addr /*new*/);
+
+  GrpcChannelCache* GetChannelChache() { return channel_cache_; }
+
+  std::shared_ptr<RdmaSendDriverMgr> GetRdmaSendDriverMgr() {
+    return rdma_send_driver_mgr_;
+  }
+
+  // For the tensor response path / the Send kernel op
+  void SendDriverData(const Tensor& in,
+                      bool is_dead,
+                      const std::string& name);
+
+  // (1) entry point of the send-driven setup
+  void InitAndSetDriverStatus();
+
+  void TestPleSendOrCheck() {
+    LOG(INFO) << "TestPleSendOrCheck begin...";
+    PleSendOrCheck();
+  }
+
+  void FakeAllocateTest() {
+    Tensor fill_shape_tensor(DT_INT32, TensorShape({1}));
+    fill_shape_tensor.vec<int32>()(0) = 1;
+    // fill_shape_tensor.vec<int32>()(1) = 256;
+    // fill_shape_tensor.vec<int32>()(2) = 1024;
+    // fill_shape_tensor.vec<int32>()(3) = 1024;
+    auto flat = fill_shape_tensor.flat<int32>();
+    auto ts = fill_shape_tensor.vec<int32>();
+    LOG(INFO) << "ts size:" << ts.size()
+              << " flat size:" << flat.size();
+    for (int i = 0; i < flat.size(); ++i) {
+      // flat(i) = i;
+      LOG(INFO) << "ts " << i << " :" << ts(i);
+    }
+  }
+
+  void PleSendOrCheck();
+
+  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
+
+  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
+                                          DataType dtype,
+                                          const TensorShape& shape,
+                                          bool is_dead, size_t proto_size);
+  // Memory regions
+  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+
+ public:
+  bool could_send_driver_ = false;
+  string local_name_;
+  string remote_name_;
+  std::shared_ptr<ChannelRecordTensorMetaData> channel_record_;
+  std::shared_ptr<RdmaSendDriverMgr> rdma_send_driver_mgr_;
+  std::shared_ptr<LocalDriverBufferMgr> local_driver_buffer_mgr_;
 
  private:
   static const int kPingBuffSize = 1024;
@@ -452,24 +1095,49 @@ class RdmaChannel {
 
  protected:
   const RdmaAdapter* adapter_;
+  RdmaMgr* rdma_mgr_;
   RdmaAddress self_;
-  string local_name_;
-  string remote_name_;
   ibv_qp* qp_;
   mutex mu_;
-  bool connected_ TF_GUARDED_BY(mu_) = false;
-  RdmaAddress remote_ TF_GUARDED_BY(mu_);
-  bool remote_set_ TF_GUARDED_BY(mu_) = false;
+  bool connected_ GUARDED_BY(mu_) = false;
+  RdmaAddress remote_ GUARDED_BY(mu_);
+  bool remote_set_ GUARDED_BY(mu_) = false;
   mutex ct_mu_;
   typedef std::unordered_map<uint32_t, RdmaTensorRequest> RequestTable;
-  RequestTable request_table_ TF_GUARDED_BY(ct_mu_);
-  uint32_t request_serial_ TF_GUARDED_BY(ct_mu_);
+  RequestTable request_table_ GUARDED_BY(ct_mu_);
+  typedef std::unordered_map<std::string, uint32_t> ParsedKeyToIndex;
+  typedef std::unordered_map<uint32_t, std::string> IndexToParsedKey;
+
+  IndexToParsedKey req_table_idx_to_pkey_ GUARDED_BY(ct_mu_);
+
+  uint32_t request_serial_ GUARDED_BY(ct_mu_);
   mutex responses_mu_;
-  typedef std::unordered_map<uint32_t, RdmaTensorResponse> ResponsesTable;
-  ResponsesTable responses_table_ TF_GUARDED_BY(responses_mu_);
-  RdmaMessageBuffer* tx_message_buffer_;
-  RdmaMessageBuffer* rx_message_buffer_;
+  typedef std::unordered_map<uint32_t, std::shared_ptr<RdmaTensorResponse>>
+      ResponsesTable;
+  ResponsesTable responses_table_ GUARDED_BY(responses_mu_);
   std::vector<RdmaMessageBuffer*> message_buffers_;
+
+  // Managed memory regions, keyed by tensor key
+  mutex remote_bytes_addr_mu_;
+  typedef absl::flat_hash_map<std::string,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>> MRegionType;
+  // typedef std::unordered_map<std::string,
+  //     std::shared_ptr<RemoteBytesAddrMemoryRegion>> MRegionType;
+  MRegionType remote_bytes_addr_mrs_ GUARDED_BY(remote_bytes_addr_mu_);
+
+  GrpcChannelCache* const channel_cache_;
+
+  // meta-data record
+  mutex tensor_meta_data_mu_;
+  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+
+  // memory allocator state
+  Allocator* rdma_mem_allocator_;
+  RdmaMemoryMgr* rdma_memory_mgr_;
+  std::vector<SubAllocator::Visitor> alloc_visitors_;
+  std::vector<SubAllocator::Visitor> free_visitors_;
+  struct ibv_pd* pd_;  // not owned
+  size_t pagesize_ = sysconf(_SC_PAGESIZE);
 };
 
 // Class that represents a buffer for Rdma message sending.
@@ -498,15 +1166,31 @@ class RdmaMessageBuffer {
   void EnqueueItem(string Item);
   void SendNextItem();
   void CreateCPUBuffer(size_t size, bool lock = true);
+  void ChunkCreateCPUBuffer(size_t size, void* buffer, ibv_mr* mr,
+                            bool lock = true);
   void SetRemoteMR(RemoteMR rmi, bool override);
   void Write(uint32_t imm_data, size_t buffer_size);
+
   static void Write(const RdmaChannel* channel, uint32_t imm_data,
                     size_t buffer_size, uint64_t src_addr, uint32_t lkey,
                     uint64_t remote_addr, uint32_t rkey,
                     RdmaWriteIDType write_type, void* write_context);
-  static void SendAck(const RdmaChannel* channel);
+
+  static void WriteWithPrefix(const RdmaChannel* channel, uint32_t imm_data,
+                              size_t buffer_size, uint64_t src_addr,
+                              uint32_t lkey, uint64_t remote_addr,
+                              uint32_t rkey, RdmaWriteIDType write_type,
+                              void* write_context, uint64_t prefix_addr,
+                              uint32_t prefix_lkey, size_t prefix_size);
+
+  static void SendAck(const RdmaChannel* channel, int pair_index);
+
+ public:
+  int pair_index_;
+  uint64_t rm_ack_micros_;
 
  protected:
+  int64 time_guard_;
   const RdmaChannel* channel_;
   void* buffer_ = nullptr;
   bool buffer_on_host_ = true;
@@ -515,11 +1199,61 @@
   ibv_mr* self_ = nullptr;
   mutex mu_;
   RemoteMR remote_;
-  std::queue<string> queue_ TF_GUARDED_BY(mu_);
-  BufferStatus local_status_ TF_GUARDED_BY(mu_) = none;
-  BufferStatus remote_status_ TF_GUARDED_BY(mu_) = none;
+  std::queue<string> queue_ GUARDED_BY(mu_);
+  BufferStatus local_status_ GUARDED_BY(mu_) = none;
+  BufferStatus remote_status_ GUARDED_BY(mu_) = none;
+};
+
+class VerbsEnvRegistrar {
+ public:
+  static VerbsEnvRegistrar* Instance() {
+    static VerbsEnvRegistrar* instance_ = new VerbsEnvRegistrar();
+    return instance_;
+  }
+
+  int RdmaCQpoolSize() {
+    return rdma_cqpool_size_;
+  }
+
+  bool RdmaEnableSendDriven() {
+    return enable_send_driven_;
+  }
+
+  int RdmaTensorBufferRatio() {
+    return rdma_tensor_buffer_ratio_;
+  }
+
+  int RdmaCqNums() {
+    return rdma_cq_nums_;
+  }
+
+  int RdmaChunkSize() {
+    return rdma_chunk_size_;
+  }
+
+ private:
+  VerbsEnvRegistrar() {
+    rdma_cqpool_size_ = RDMACQPOOLSIZE();
+    CHECK(rdma_cqpool_size_ < 500 && rdma_cqpool_size_ >= 1)
+        << "rdma_cqpool_size_ must be at least 1 and less than 500";
+    enable_send_driven_ = RDMAENABLESENDDRIERN() == 1 ? true : false;
+
+    rdma_tensor_buffer_ratio_ = RDMATENSORBUFFERRATIO();
+    CHECK(rdma_tensor_buffer_ratio_ < 100 && rdma_tensor_buffer_ratio_ >= 1)
+        << "rdma_tensor_buffer_ratio_ must be at least 1 and less than 100";
+
+    rdma_cq_nums_ = RDMACQNUMS();
+    CHECK(rdma_cq_nums_ < 100 && rdma_cq_nums_ >= 1)
+        << "rdma_cq_nums_ must be at least 1 and less than 100";
+    rdma_chunk_size_ = RDMACHUNKSIZE();
+  }
+
+  int rdma_cqpool_size_;
+  bool enable_send_driven_;
+  int rdma_tensor_buffer_ratio_;
+  int rdma_cq_nums_;
+  int rdma_chunk_size_;
 };
 
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_H_
diff --git a/tensorflow_networking/verbs/rdma_mgr.cc b/tensorflow_networking/verbs/rdma_mgr.cc
index 6d19758..300f16e 100644
--- a/tensorflow_networking/verbs/rdma_mgr.cc
+++ b/tensorflow_networking/verbs/rdma_mgr.cc
@@ -13,20 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
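// --- Illustrative aside (not part of the patch): RDMACQPOOLSIZE(),
// RDMACQNUMS() and friends are not defined in this diff. Assuming they read
// environment knobs, this standalone sketch shows the usual pattern; the
// variable name RDMA_CQ_NUMS and the defaults are invented for the example.
#include <cstdlib>

// Reads an integer knob from the environment, falling back to `def` and
// rejecting values outside [lo, hi), mirroring the CHECKs in
// VerbsEnvRegistrar above.
static int EnvInt(const char* name, int def, int lo, int hi) {
  const char* v = std::getenv(name);
  if (v == nullptr || *v == '\0') return def;
  int parsed = std::atoi(v);
  return (parsed < lo || parsed >= hi) ? def : parsed;
}

int main() {
  int rdma_cq_nums = EnvInt("RDMA_CQ_NUMS", 1, 1, 100);
  return rdma_cq_nums >= 1 ? 0 : 1;
}
// --- end aside ---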
==============================================================================*/ +#ifdef TENSORFLOW_USE_VERBS + #include "tensorflow_networking/verbs/rdma_mgr.h" #include #include -#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" -#include "tensorflow/core/common_runtime/gpu/gpu_util.h" -#include "tensorflow/core/common_runtime/pool_allocator.h" -#include "tensorflow/core/common_runtime/process_state.h" +#include "tensorflow_networking/verbs/grpc_verbs_client.h" +#include "tensorflow_networking/verbs/verbs_service.pb.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" #include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow_networking/verbs/grpc_verbs_client.h" -#include "tensorflow_networking/verbs/verbs_service.pb.h" namespace tensorflow { @@ -41,36 +39,70 @@ RdmaMgr::RdmaMgr(const WorkerEnv* const worker_env, std::vector workers; worker_env_->session_mgr->LegacySession()->worker_cache->ListWorkers( &workers); + num_remote_workers_ = workers.size() - 1; VLOG(2) << "rmda_mgr on local worker: " << local_worker_; + string other_worker_name = ""; + int worker_cq_num = 0; + int ps_cq_num = 0; for (size_t i = 0; i < workers.size(); i++) { if (local_worker_.compare(workers[i]) != 0) { + other_worker_name += ";" + workers[i]; + ibv_cq* cq = nullptr; + if (workers[i].find("worker") != string::npos) { + RDMA_LOG(2) << "Schedule CQ num For worker: " + << workers[i] + << " cq_num:" + << worker_cq_num % rdma_adapter_->cq_nums_; + cq = rdma_adapter_->cq_vec_[worker_cq_num % rdma_adapter_->cq_nums_]; + worker_cq_num++; + } else if (workers[i].find("ps") != string::npos) { + RDMA_LOG(2) << "Schedule CQ num For ps: " + << workers[i] + << " cq_num:" + << ps_cq_num % rdma_adapter_->cq_nums_; + cq = rdma_adapter_->cq_vec_[ps_cq_num % rdma_adapter_->cq_nums_]; + ps_cq_num++; + } else { + RDMA_LOG(2) << "Schedule CQ num For chief: " + << workers[i] + << " cq_num:" + << 0; + cq = rdma_adapter_->cq_vec_[0]; + } channel_table_.insert( {workers[i], - new RdmaChannel(rdma_adapter_, local_worker_, workers[i])}); + new RdmaChannel(rdma_adapter_, local_worker_, workers[i], channel_cache_, + cq)}); } } + LOG(INFO) << "local_worker: " << local_worker_ << " other_channel:" << other_worker_name; } // Setup Rdma channels between peers. // This is done at the beginning of the server setup. 
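// --- Illustrative aside (not part of the patch): SetupChannels below
// generalizes the old idx[] = {1, 0} crossing to kNumMessageBuffers entries,
// pairing each even local tx buffer with the odd remote rx buffer and vice
// versa. A standalone check of that index math:
#include <cassert>

int main() {
  const int kNumMessageBuffers = 6;  // illustrative; the patch uses 80
  int idx[kNumMessageBuffers];
  for (int k = 0; k < kNumMessageBuffers; k += 2) {
    idx[k] = k + 1;  // local tx (even) lines up with remote rx (odd)
    idx[k + 1] = k;  // local rx (odd) lines up with remote tx (even)
  }
  // Yields {1, 0, 3, 2, 5, 4}: every adjacent pair is crossed, as the
  // "connections are crossed" comment in SetupChannels says.
  assert(idx[0] == 1 && idx[1] == 0 && idx[4] == 5 && idx[5] == 4);
  return 0;
}
// --- end aside ---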
void RdmaMgr::SetupChannels() { + LOG(INFO) << "channel_table_size:" << channel_table_.size(); for (const auto& p : channel_table_) { string worker_name = p.first; RDMA_LOG(2) << "Connecting to remote node " << worker_name; + LOG(INFO) << "Connecting to remote node " << worker_name; RdmaChannel* rc = p.second; GetRemoteAddressRequest req; GetRemoteAddressResponse resp; // get the channel cache SharedGrpcChannelPtr client_channel = channel_cache_->FindWorkerChannel(worker_name); + + CHECK(client_channel != nullptr) << "target:" << worker_name << " client_channel is null!"; + GrpcVerbsClient* client = new GrpcVerbsClient(client_channel); CHECK(client != nullptr) << "No worker known as " << worker_name; // setting up request req.set_host_name(local_worker_); - Channel* channel_info = req.mutable_channel(); + ChannelInfo* channel_info = req.mutable_channel(); channel_info->set_lid(rc->self_.lid); channel_info->set_qpn(rc->self_.qpn); channel_info->set_psn(rc->self_.psn); @@ -101,12 +133,21 @@ void RdmaMgr::SetupChannels() { rc->SetRemoteAddress(ra, false); rc->Connect(); int i = 0; - int idx[] = {1, 0}; + // int idx[] = {1, 0}; + // {1, 0, 3, 2, 5, 4} + int idx[RdmaChannel::kNumMessageBuffers + 1]; + for (auto k = 0; k < RdmaChannel::kNumMessageBuffers; k = k + 2) { + // for (auto k=0; k<2; k=k+2) { + idx[k] = k+1; + idx[k+1] = k; + } + for (const auto& mr : resp.mr()) { // the connections are crossed, i.e. // local tx_message_buffer <---> remote rx_message_buffer_ // local rx_message_buffer <---> remote tx_message_buffer_ // hence idx[] = {1, 0}. + // LOG(ERROR) << "resp index:" << i << " local message_buffer idx:" << idx[i]; RdmaMessageBuffer* rb = rc->message_buffers_[idx[i]]; RemoteMR rmr; rmr.remote_addr = mr.remote_addr(); @@ -134,10 +175,11 @@ void RdmaMgr::SetupChannels() { bool RdmaMgr::ConnectivityCheck() { int i, rcnt = 0, scnt = 0; + int num_remote_workers = 0; for (const auto& p : channel_table_) { + num_remote_workers++; string worker_name = p.first; RdmaChannel* rc = p.second; - VLOG(2) << "Ping to " << worker_name; CHECK(rc->PingPostSend() == 0) << "Couldn't post send to " << worker_name << " with error: " << std::strerror(errno); @@ -145,38 +187,50 @@ bool RdmaMgr::ConnectivityCheck() { rc->Recv(); } } + LOG(INFO) << "PingPostSend num_remote_workers:" << num_remote_workers; - while (rcnt < num_remote_workers_ || scnt < num_remote_workers_) { - int ne; - do { - ne = ibv_poll_cq(rdma_adapter_->cq_, 2 * num_remote_workers_, - rdma_adapter_->wc_); - CHECK(ne >= 0) << "poll CQ failed " << ne << "with error" - << std::strerror(errno); - } while (ne < 1); - - for (i = 0; i < ne; ++i) { - ibv_wc_status s = rdma_adapter_->wc_[i].status; - // recv complete - if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) { - CHECK(s == IBV_WC_SUCCESS) - << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "(" - << rdma_adapter_->wc_[i].status << ") for PING_RECV_WRID"; - ++rcnt; - // send complete - } else { - RdmaChannel* rc = - reinterpret_cast(rdma_adapter_->wc_[i].wr_id); - CHECK(s == IBV_WC_SUCCESS) - << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "(" - << rdma_adapter_->wc_[i].status << ") to " << rc->remote_name_; - ++scnt; - } - } // for + while (rcnt < num_remote_workers || scnt < num_remote_workers) { + for (int j = 0; j < rdma_adapter_->cq_nums_; j++) { + int ne = 0; + int retry_times = 0; + do { + ne = ibv_poll_cq(rdma_adapter_->cq_vec_[j], 2 * num_remote_workers_, + rdma_adapter_->wc_vec_[j]); + CHECK(ne >= 0) << "poll CQ failed " << ne << "with 
error" + << std::strerror(errno); + retry_times ++; + if (retry_times > 10) { + break; + } + } while (ne < 1); + for (i = 0; i < ne; ++i) { + ibv_wc_status s = rdma_adapter_->wc_vec_[j][i].status; + // recv complete + if ((int)rdma_adapter_->wc_vec_[j][i].wr_id == RdmaChannel::kPingRecvWrid) { + CHECK(s == IBV_WC_SUCCESS) + << ": " << ibv_wc_status_str(rdma_adapter_->wc_vec_[j][i].status) << "(" + << rdma_adapter_->wc_vec_[j][i].status << ") for PING_RECV_WRID"; + ++rcnt; + // send complete + } else { + RdmaChannel* rc = + reinterpret_cast(rdma_adapter_->wc_vec_[j][i].wr_id); + CHECK(s == IBV_WC_SUCCESS) + << ": " << ibv_wc_status_str(rdma_adapter_->wc_vec_[j][i].status) << "(" + << rdma_adapter_->wc_vec_[j][i].status << ") to " << rc->remote_name_; + ++scnt; + } + } // for + } } // while + LOG(INFO) << "ConnectivityCheck:" + << num_remote_workers + << " rcnt:" << rcnt + << " scnt:" << scnt; + CHECK(rcnt == scnt) << "Connectivity check failed!"; rdma_adapter_->StartPolling(); - return (num_remote_workers_ == rcnt) && (num_remote_workers_ == scnt); + return rcnt == scnt; } RdmaMgr::~RdmaMgr() { @@ -192,10 +246,48 @@ RdmaMgr::~RdmaMgr() { // channel object that is connected to the named peer. RdmaChannel* RdmaMgr::FindChannel(const string& name) { ChannelTable::iterator iter = channel_table_.find(name); - CHECK(iter != channel_table_.end()); + CHECK(iter != channel_table_.end()) + << "name:" << name + << "table_name like:" + << channel_table_.begin()->first; return iter->second; } +bool RdmaMgr::NotifyAsyncAllocatorTest() { + for (const auto& p : channel_table_) { + string worker_name = p.first; + LOG(INFO) << "NotifyAsyncAllocator to remote node " << worker_name; + RdmaChannel* rc = p.second; + // 请 ps 端进行态空间分配,并同步静态空间给我 + rc->PleSendOrCheck(); + LOG(INFO) << "NotifyAsyncAllocator PleSendOrCheck to remote node" + << worker_name + << " Succeed!"; + } + return true; +} + +bool RdmaMgr::NotifyAsyncAllocator() { + for (const auto& p : channel_table_) { + string worker_name = p.first; + LOG(INFO) << "NotifyAsyncAllocator to remote node " << worker_name; + RdmaChannel* rc = p.second; + // 分配静态空间并且同步自身的静态空间给对方 + // TODO(wuyongyu02): change to large MR + rc->InitAndSetDriverStatus(); + LOG(INFO) << "NotifyAsyncAllocator InitAndSetDriverStatus to remote node " + << worker_name + << " Succeed!"; + // 请 ps 端进行态空间分配,并同步静态空间给我 + rc->PleSendOrCheck(); + LOG(INFO) << "NotifyAsyncAllocator PleSendOrCheck to remote node" + << worker_name + << " Succeed!"; + + } + return true; +} + bool IsGDRAvailable() { #if defined(__APPLE__) return false; @@ -236,8 +328,9 @@ int TryToReadNumaNode(ibv_device* device) { if (strings::safe_strto32(content, &value)) { if (value < 0) { LOG(INFO) << "Successful NUMA node read from SysFS had negative value (" - << value << "), but there must be at least one NUMA node" - ", so returning NUMA node zero"; + << value + << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; return 0; } LOG(INFO) << "NUMA node for device: " << device->name << " is " << value; @@ -254,26 +347,35 @@ void MRDeleter(ibv_mr* mr) { } void RdmaMgr::InitAllocators() { - static std::once_flag flag; - std::call_once( - flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; }); + // static std::once_flag flag; + // std::call_once( + // flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; }); } /*static*/ void RdmaMgr::RegMemVisitors() { + // SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, + // size_t num_bytes) { + // LOG(INFO) << 
"RdmaMgr alloc_visitor"; + // RdmaMemoryMgr::Singleton().InsertMemoryRegion( + // ptr, num_bytes, strings::StrCat("CPU:", numa_node)); + // }; + // SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, + // size_t num_bytes) { + // RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes); + // }; + + // LOG(INFO) << " ProcessState::singleton()->AddCPUAllocVisitor..."; + // ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); + // ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); + +#if GOOGLE_CUDA SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("CPU:", numa_node)); + LOG(ERROR) << "Rdma For GPU is not supported!"; }; SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, size_t num_bytes) { - RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes); }; - - ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); - ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); - -#if GOOGLE_CUDA GPUProcessState::singleton()->AddGpuHostAllocVisitor(0, alloc_visitor); GPUProcessState::singleton()->AddGpuHostFreeVisitor(0, free_visitor); @@ -289,8 +391,8 @@ void RdmaMgr::InitAllocators() { SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id, size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); + // RdmaMemoryMgr::Singleton().InsertMemoryRegion( + // ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); }; GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor); @@ -300,3 +402,5 @@ void RdmaMgr::InitAllocators() { } } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma_mgr.h b/tensorflow_networking/verbs/rdma_mgr.h index 06df124..9e3d9bd 100644 --- a/tensorflow_networking/verbs/rdma_mgr.h +++ b/tensorflow_networking/verbs/rdma_mgr.h @@ -16,18 +16,32 @@ limitations under the License. 
 #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 #define TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include <string>
 #include <unordered_map>
 
+#include "tensorflow_networking/verbs/rdma.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
-#include "tensorflow_networking/verbs/rdma.h"
+// For the timeline logger
+#include "tensorflow/core/distributed_runtime/worker_cache_logger.h"
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_partial.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 
 namespace tensorflow {
 
+class RdmaChannel;
+class RdmaAdapter;
+class RdmaTensorRequest;
 class RdmaMgr {
   friend class RdmaChannel;
   friend class RdmaAdapter;
+  friend class RdmaSendDriverMgr;
 
  public:
   explicit RdmaMgr(const WorkerEnv* const worker_env,
@@ -40,17 +54,74 @@ class RdmaMgr {
   static void RegMemVisitors();
   const string& local_worker() { return local_worker_; }
 
- private:
+  bool NotifyAsyncAllocator();
+
+  bool NotifyAsyncAllocatorTest();
+
+ public:
   string local_worker_;
-  size_t num_remote_workers_;
   const WorkerEnv* const worker_env_;
   GrpcChannelCache* const channel_cache_;
+
+ private:
+  size_t num_remote_workers_;
   RdmaAdapter* rdma_adapter_;
   typedef std::unordered_map<string, RdmaChannel*> ChannelTable;
   ChannelTable channel_table_;
   TF_DISALLOW_COPY_AND_ASSIGN(RdmaMgr);
 };
 
+class RdmaBasicCPUAllocator : public SubAllocator {
+ public:
+  RdmaBasicCPUAllocator(const std::vector<SubAllocator::Visitor>& alloc_visitors,
+                        const std::vector<SubAllocator::Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors) {
+    numa_node_ = port::kNUMANoAffinity;
+  }
+
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    void* ptr = nullptr;
+    if (num_bytes > 0) {
+      if (numa_node_ == port::kNUMANoAffinity) {
+        ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+      } else {
+        ptr = port::NUMAMalloc(numa_node_, num_bytes,
+                               static_cast<int>(alignment));
+      }
+      VisitAlloc(ptr, numa_node_, num_bytes);
+    }
+    return ptr;
+  }
+
+  void Free(void* ptr, size_t num_bytes) override {
+    if (num_bytes > 0) {
+      VisitFree(ptr, numa_node_, num_bytes);
+      if (numa_node_ == port::kNUMANoAffinity) {
+        port::AlignedFree(ptr);
+      } else {
+        port::NUMAFree(ptr, num_bytes);
+      }
+    }
+  }
+
+ private:
+  int numa_node_;
+  TF_DISALLOW_COPY_AND_ASSIGN(RdmaBasicCPUAllocator);
+};
+
+// TODO(wuyongyu02): remove this class and its registration once the default
+// cpu_allocator() returns a visitable allocator.
+class BFCRdmaAllocator : public BFCAllocator {
+ public:
+  BFCRdmaAllocator(const std::vector<SubAllocator::Visitor>& alloc_visitors,
+                   const std::vector<SubAllocator::Visitor>& free_visitors)
+      : BFCAllocator(new RdmaBasicCPUAllocator(alloc_visitors, free_visitors),
+                     1LL << 36, true, "cpu_rdma_bfc") {
+  }
+};
+// REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
diff --git a/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc b/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
index f9a1afa..d54d754 100644
--- a/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
@@ -13,31 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License.
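// --- Illustrative aside (not part of the patch): how the visitor hooks are
// meant to be wired, mirroring the commented-out code in RegMemVisitors
// above. MakeRdmaAllocator and `mgr` are hypothetical; the classes come from
// this patch.
#include <vector>
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow_networking/verbs/rdma_mgr.h"

namespace tensorflow {
inline Allocator* MakeRdmaAllocator(RdmaMemoryMgr* mgr) {
  // Every chunk the BFC arena maps gets registered as an RDMA memory region,
  // and evicted again on free, so tensors in this arena are directly
  // RDMA-able.
  std::vector<SubAllocator::Visitor> alloc_visitors = {
      [mgr](void* ptr, int numa_node, size_t num_bytes) {
        mgr->InsertMemoryRegion(ptr, num_bytes,
                                strings::StrCat("CPU:", numa_node));
      }};
  std::vector<SubAllocator::Visitor> free_visitors = {
      [mgr](void* ptr, int numa_node, size_t num_bytes) {
        mgr->EvictMemoryRegion(ptr, num_bytes);
      }};
  return new BFCRdmaAllocator(alloc_visitors, free_visitors);
}
}  // namespace tensorflow
// --- end aside ---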
==============================================================================*/ +#ifdef TENSORFLOW_USE_VERBS + #include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h" #include +#include "tensorflow_networking/verbs/verbs_util.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow_networking/verbs/verbs_util.h" +#include "tensorflow/core/distributed_runtime/worker_cache_partial.h" namespace tensorflow { class RdmaRemoteRendezvous : public BaseRemoteRendezvous { public: RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr) - : BaseRemoteRendezvous(env, step_id), rdma_mgr_(rdma_mgr) {} + : BaseRemoteRendezvous(env, step_id) { + rdma_mgr_ = rdma_mgr; + } protected: void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, DoneCallback done) override; + public: + RdmaMgr* rdma_mgr_; + private: ~RdmaRemoteRendezvous() override {} - RdmaMgr* rdma_mgr_; + TF_DISALLOW_COPY_AND_ASSIGN(RdmaRemoteRendezvous); }; @@ -59,8 +67,8 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync( done(s, Args(), recv_args, Tensor{}, false); return; } - CHECK(dst_name.compare(rdma_mgr_->local_worker()) == 0); - RdmaChannel* rc = rdma_mgr_->FindChannel(src_name); + CHECK(dst_name.compare(static_cast(rdma_mgr_)->local_worker()) == 0); + RdmaChannel* rc = static_cast(rdma_mgr_)->FindChannel(src_name); string key(parsed.FullKey()); string key_with_step_id = VerbsUtil::AppendStepidToKey(key, step_id_); @@ -72,6 +80,17 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync( return; } + uint64_t time_now = Env::Default()->NowMicros(); + + // Add to Channel LocalDriverBufferMgr + if (rc->could_send_driver_) { + RDMA_LOG(1) << "Recv From Local key:" << key << " will GetBufferMgr"; + rc->local_driver_buffer_mgr_->QueueLoadAsync( + key, recv_args, std::move(done), Env::Default()->NowMicros()); + return; + } + RDMA_LOG(1) << "Request start:" << key; + RdmaTensorRequest* request = rc->InsertTensorRequest(key, step_id_, dst_dev, recv_args, done); request->Start(); @@ -86,3 +105,5 @@ BaseRemoteRendezvous* RdmaRendezvousMgr::Create(int64 step_id, } } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma_rendezvous_mgr.h b/tensorflow_networking/verbs/rdma_rendezvous_mgr.h index a750dbb..5455235 100644 --- a/tensorflow_networking/verbs/rdma_rendezvous_mgr.h +++ b/tensorflow_networking/verbs/rdma_rendezvous_mgr.h @@ -16,9 +16,12 @@ limitations under the License. 
 #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 #define TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow_networking/verbs/rdma.h"
 #include "tensorflow_networking/verbs/rdma_mgr.h"
 
 namespace tensorflow {
@@ -46,6 +49,12 @@ class RdmaRendezvousMgr : public BaseRendezvousMgr {
   explicit RdmaRendezvousMgr(const WorkerEnv* env);
   void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
 
+  bool NotifyAsyncAllocatorTest() {
+    return rdma_mgr_->NotifyAsyncAllocator();
+  }
+
 protected:
   BaseRemoteRendezvous* Create(int64 step_id,
                                const WorkerEnv* worker_env) override;
@@ -57,4 +66,5 @@
 }  // end namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
diff --git a/tensorflow_networking/verbs/verbs_server_lib.cc b/tensorflow_networking/verbs/verbs_server_lib.cc
index 103db21..74ab309 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.cc
+++ b/tensorflow_networking/verbs/verbs_server_lib.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/verbs_server_lib.h"
 
 #include "grpc/support/alloc.h"
 
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 
 namespace tensorflow {
 
@@ -41,7 +43,7 @@ VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
 VerbsServer::~VerbsServer() {
   TF_CHECK_OK(Stop());
   TF_CHECK_OK(Join());
-  delete rdma_mgr_;
+  // delete rdma_mgr_;
   delete verbs_service_;
   delete channel_cache_;
 }
@@ -49,8 +51,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
                      "/task:", server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -59,6 +61,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());
 
   const string host_port = (*channel_cache)->TranslateTask(name_prefix);
+
   int requested_port;
 
   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
@@ -79,7 +82,6 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
   GrpcServerOptions opts;
   opts.service_func = service_func;
   opts.rendezvous_mgr_func = rendezvous_mgr_func;
@@ -88,12 +90,16 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
     CHECK(ChannelCacheFactory(server_def(),
diff --git a/tensorflow_networking/verbs/verbs_server_lib.cc b/tensorflow_networking/verbs/verbs_server_lib.cc
index 103db21..74ab309 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.cc
+++ b/tensorflow_networking/verbs/verbs_server_lib.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
==============================================================================*/

+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/verbs_server_lib.h"

 #include "grpc/support/alloc.h"
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"

 namespace tensorflow {

@@ -41,7 +43,7 @@ VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
 VerbsServer::~VerbsServer() {
   TF_CHECK_OK(Stop());
   TF_CHECK_OK(Join());
-  delete rdma_mgr_;
+  // delete rdma_mgr_;
   delete verbs_service_;
   delete channel_cache_;
 }

@@ -49,8 +51,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());

   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -59,6 +61,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());

   const string host_port = (*channel_cache)->TranslateTask(name_prefix);
+
   int requested_port;

   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
@@ -79,7 +82,6 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
-
   GrpcServerOptions opts;
   opts.service_func = service_func;
   opts.rendezvous_mgr_func = rendezvous_mgr_func;
@@ -88,12 +90,16 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
     CHECK(ChannelCacheFactory(server_def(), &channel_cache_).ok());
+    LOG(INFO) << "ChannelCacheFactory init GrpcChannelCache End.";
     rdma_mgr_ = new RdmaMgr(worker_env(), channel_cache_);
     // set rdma_mgr for verbs_service and rdma_rendezvous_mgr
     verbs_service_->SetRdmaMgr(rdma_mgr_);
+    LOG(INFO) << "VerbsService SetRdmaMgr End.";
     dynamic_cast<RdmaRendezvousMgr*>(worker_env()->rendezvous_mgr)
         ->SetRdmaMgr(rdma_mgr_);
+    LOG(INFO) << "RdmaRendezvousMgr SetRdmaMgr End.";
   }
+  LOG(INFO) << "VerbsServer::Init End.";
   return s;
 }

@@ -107,10 +113,14 @@ Status VerbsServer::Start() {
       verbs_thread_.reset(worker_env()->env->StartThread(
           ThreadOptions(), "TF_verbs_service",
           [this] { verbs_service_->HandleRPCsLoop(); }));
+      LOG(INFO) << "Start SetupChannels begin:";
       rdma_mgr_->SetupChannels();
+      LOG(INFO) << "rdma_mgr_ SetupChannels succeed!";
       CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
+      LOG(INFO) << "rdma_mgr_ Connectivity check succeed!";
       rdma_mgr_->InitAllocators();
       verbs_state_ = CONNECTED;
+      LOG(INFO) << "verbs state CONNECTED.";
     }
   }
   return s;
@@ -171,3 +181,5 @@ static VerbsServerRegistrar registrar;
 }  // namespace

 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
diff --git a/tensorflow_networking/verbs/verbs_server_lib.h b/tensorflow_networking/verbs/verbs_server_lib.h
index 2869be1..3662921 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.h
+++ b/tensorflow_networking/verbs/verbs_server_lib.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
 #define TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_

-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/grpc_verbs_service.h"
 #include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"

 namespace tensorflow {

@@ -43,7 +45,7 @@ class VerbsServer : public GrpcServer {
                RendezvousMgrCreationFunction rendezvous_mgr_func);
   Status ChannelCacheFactory(const ServerDef& server_def,
                              GrpcChannelCache** channel_cache);
-
+
  private:
   RdmaMgr* rdma_mgr_;

@@ -51,13 +53,14 @@ class VerbsServer : public GrpcServer {
   mutex mu_;
   enum State { DISCONNECTED, CONNECTED };
-  State verbs_state_ TF_GUARDED_BY(mu_);
+  State verbs_state_ GUARDED_BY(mu_);

   GrpcVerbsService* verbs_service_ = nullptr;
-  std::unique_ptr<Thread> verbs_thread_ TF_GUARDED_BY(mu_);
+  std::unique_ptr<Thread> verbs_thread_ GUARDED_BY(mu_);
   GrpcChannelCache* channel_cache_ = nullptr;
 };

 }  // namespace tensorflow

+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
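The header swaps `TF_GUARDED_BY` for the older `GUARDED_BY` macro (matching the TensorFlow release this patch targets) but keeps the same two-state connection machine. A self-contained sketch of that machine, with `std::mutex` standing in for TF's annotated `mutex`:

    // Sketch of the DISCONNECTED -> CONNECTED transition in VerbsServer;
    // std::mutex stands in for tensorflow::mutex and GUARDED_BY.
    #include <cassert>
    #include <mutex>

    class VerbsStateModel {
     public:
      void Connect() {
        std::lock_guard<std::mutex> l(mu_);
        assert(state_ == DISCONNECTED);
        // SetupChannels / ConnectivityCheck / InitAllocators happen here.
        state_ = CONNECTED;
      }

     private:
      enum State { DISCONNECTED, CONNECTED };
      std::mutex mu_;
      State state_ = DISCONNECTED;  // guarded by mu_ in the real class
    };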
diff --git a/tensorflow_networking/verbs/verbs_service.proto b/tensorflow_networking/verbs/verbs_service.proto
index abdae1d..3f68020 100644
--- a/tensorflow_networking/verbs/verbs_service.proto
+++ b/tensorflow_networking/verbs/verbs_service.proto
@@ -26,7 +26,7 @@ option java_package = "org.tensorflow.contrib.verbs";
 //
 ////////////////////////////////////////////////////////////////////////////////

-message Channel {
+message ChannelInfo {
   int32 lid = 1;
   int32 qpn = 2;
   int32 psn = 3;
@@ -40,16 +40,54 @@ message MemoryRegion {
 }

 message GetRemoteAddressRequest {
   string host_name = 1;
-  Channel channel = 2;
+  ChannelInfo channel = 2;
   repeated MemoryRegion mr = 3;
 }

 message GetRemoteAddressResponse {
   string host_name = 1;
-  Channel channel = 2;
+  ChannelInfo channel = 2;
   repeated MemoryRegion mr = 3;
 }

+message DriverMessageItem {
+  enum DriverStatus {
+    DRIVER_INIT = 0;
+    RPC_0 = 1;
+    RPC_1 = 2;
+    DATA_NOT_READY = 4;
+    DATA_READY = 5;
+    DRIVER_ERROR = 6;
+  }
+  uint32 unique_id = 1;
+  string parsed_key = 2;
+  uint64 remote_addr = 3;
+  uint32 rkey = 4;
+  DriverStatus status = 5;
+  int32 allocate_bytes = 6;
+  bool meta_changed = 7;
+}
+
+message DriverMessageReq {
+  string host_name = 1;
+  repeated DriverMessageItem item = 2;
+}
+
+message DriverMessageResp {
+  string host_name = 1;
+  repeated DriverMessageItem item = 2;
+}
+
+message PleSendOrCheckReq {
+  string host_name = 1;
+}
+
+message PleSendOrCheckResp {
+  string host_name = 1;
+  bool is_ok = 2;
+}
+
 message ErrorStatusProto {
   int32 error_code = 1;
   string error_message = 2;
@@ -65,4 +103,8 @@ message ErrorStatusProto {
 service VerbsService {
   rpc GetRemoteAddress(GetRemoteAddressRequest)
       returns (GetRemoteAddressResponse);
+  rpc ReqDriverMessage(DriverMessageReq)
+      returns (DriverMessageResp);
+  rpc ReqPleSendOrCheck(PleSendOrCheckReq)
+      returns (PleSendOrCheckResp);
 }
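Renaming `Channel` to `ChannelInfo` avoids a clash with the C++ side's channel types, and the two new RPCs carry the driver handshake: `ReqDriverMessage` exchanges per-tensor buffer state, and `ReqPleSendOrCheck` asks the peer to start sending or report readiness. Assuming the usual gRPC C++ codegen for this proto (this repo may instead hand-roll its stubs, so treat the generated header, stub type, and channel target below as assumptions), a client call would look roughly like:

    // Hypothetical client-side call, assuming standard gRPC C++ codegen
    // for verbs_service.proto; "worker1:2222" is a placeholder target.
    #include "grpcpp/grpcpp.h"
    #include "tensorflow_networking/verbs/verbs_service.grpc.pb.h"  // generated

    int main() {
      auto channel = grpc::CreateChannel("worker1:2222",
                                         grpc::InsecureChannelCredentials());
      auto stub = tensorflow::VerbsService::NewStub(channel);

      tensorflow::DriverMessageReq req;
      req.set_host_name("worker0");
      auto* item = req.add_item();
      item->set_parsed_key("edge_1;0:0");
      item->set_status(tensorflow::DriverMessageItem::DATA_READY);

      tensorflow::DriverMessageResp resp;
      grpc::ClientContext ctx;
      grpc::Status s = stub->ReqDriverMessage(&ctx, req, &resp);
      return s.ok() ? 0 : 1;
    }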
diff --git a/tensorflow_networking/verbs/verbs_testlib.h b/tensorflow_networking/verbs/verbs_testlib.h
new file mode 100644
index 0000000..a95cb9d
--- /dev/null
+++ b/tensorflow_networking/verbs/verbs_testlib.h
@@ -0,0 +1,60 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+class Device;
+
+namespace test {
+
+// Provides a handle to a set of TensorFlow servers (masters and
+// workers) for testing purposes.
+//
+// This class currently runs the servers in separate processes; the
+// lifetime of this object is coterminous with the lifetimes of those
+// processes.
+class TestCluster {
+ public:
+  // Creates a new test cluster based on the given `options` (which
+  // configure the number of devices of each type) and a count of
+  // processes `n`. On success, the test cluster is stored in
+  // *out_cluster, and this function returns OK. Otherwise an error is
+  // returned.
+  static Status MakeTestCluster(const SessionOptions& options, int n,
+                                std::unique_ptr<TestCluster>* out_cluster);
+
+  // As above, but allows overriding the server binary path via `binary_path`.
+  static Status MakeTestCluster(const string& binary_path,
+                                const SessionOptions& options, int n,
+                                std::unique_ptr<TestCluster>* out_cluster);
+  ~TestCluster();
+
+  // Returns a vector of string "<hostname>:<port>" pairs that may be
+  // used as targets to construct a GrpcSession.
+  const std::vector<string>& targets() const { return targets_; }
+
+  // Returns a vector of devices available in this test cluster.
+  const std::vector<DeviceAttributes>& devices() const { return devices_; }
+
+ private:
+  TestCluster() = default;
+
+  std::vector<std::unique_ptr<SubProcess>> subprocesses_;
+  std::vector<string> targets_;
+  std::vector<DeviceAttributes> devices_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TestCluster);
+};
+
+}  // end namespace test
+}  // end namespace tensorflow
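A minimal sketch of how the new helper is meant to be used in a test, mirroring the `grpc_testlib` pattern it is adapted from (the device count and task count below are arbitrary):

    // Usage sketch; TF_ASSERT_OK comes from status_test_util.h.
    #include <memory>

    #include "tensorflow/core/lib/core/status_test_util.h"
    #include "tensorflow/core/public/session_options.h"
    #include "tensorflow_networking/verbs/verbs_testlib.h"

    TEST(VerbsTestCluster, StartsTwoTasks) {
      tensorflow::SessionOptions options;
      (*options.config.mutable_device_count())["CPU"] = 1;
      std::unique_ptr<tensorflow::test::TestCluster> cluster;
      TF_ASSERT_OK(
          tensorflow::test::TestCluster::MakeTestCluster(options, 2, &cluster));
      EXPECT_EQ(2, cluster->targets().size());
    }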
diff --git a/tensorflow_networking/verbs/verbs_testlib_server.cc b/tensorflow_networking/verbs/verbs_testlib_server.cc
new file mode 100644
index 0000000..e11a823
--- /dev/null
+++ b/tensorflow_networking/verbs/verbs_testlib_server.cc
@@ -0,0 +1,102 @@
+#include <unordered_map>
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// This binary starts a TensorFlow verbs server (master and worker) for test
+// purposes.
+namespace tensorflow {
+namespace {
+
+Status FillServerDef(const string& job_spec, const string& job_name,
+                     int num_cpus, int num_gpus, int task_index,
+                     ServerDef* options) {
+  options->set_protocol("grpc+verbs");
+  options->set_job_name(job_name);
+  options->set_task_index(task_index);
+
+  uint32 my_tasks_per_replica = 0;
+  for (const string& job_str : str_util::Split(job_spec, ',')) {
+    JobDef* job_def = options->mutable_cluster()->add_job();
+    // Split each entry in the flag into 2 pieces, separated by "|".
+    const std::vector<string> job_pieces = str_util::Split(job_str, '|');
+    CHECK_EQ(2, job_pieces.size()) << job_str;
+    job_def->set_name(job_pieces[0]);
+    // Does a bit more validation of the tasks_per_replica.
+    const StringPiece spec = job_pieces[1];
+    // job_str is of form <job_name>|<host_ports>.
+    const std::vector<string> host_ports = str_util::Split(spec, ';');
+    uint32 tasks_per_replica = host_ports.size();
+    for (size_t i = 0; i < host_ports.size(); ++i) {
+      (*job_def->mutable_tasks())[i] = host_ports[i];
+    }
+    if (job_def->name() == options->job_name()) {
+      my_tasks_per_replica = tasks_per_replica;
+    }
+    LOG(INFO) << "Peer " << job_def->name() << " " << tasks_per_replica << " {"
+              << absl::StrJoin(host_ports, ", ") << "}";
+  }
+  if (my_tasks_per_replica == 0) {
+    return errors::InvalidArgument("Invalid job specification");
+  }
+  ConfigProto* config = options->mutable_default_session_config();
+  (*config->mutable_device_count())["CPU"] = num_cpus;
+  (*config->mutable_device_count())["GPU"] = num_gpus;
+  return Status::OK();
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  tensorflow::string job_spec;
+  tensorflow::string job_name;
+  int num_cpus = 1;
+  int num_gpus = 0;
+  int task_index = 0;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("tf_jobs", &job_spec, "job spec"),
+      tensorflow::Flag("tf_job", &job_name, "job name"),
+      tensorflow::Flag("tf_task", &task_index, "task index"),
+      tensorflow::Flag("num_cpus", &num_cpus, "number of CPUs"),
+      tensorflow::Flag("num_gpus", &num_gpus, "number of GPUs"),
+  };
+  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result || argc != 1) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+  tensorflow::ServerDef def;
+  tensorflow::Status s = tensorflow::FillServerDef(job_spec, job_name, num_cpus,
+                                                   num_gpus, task_index, &def);
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not parse job spec: " << s.error_message() << "\n"
+               << usage;
+    return -1;
+  }
+  std::unique_ptr<tensorflow::ServerInterface> svr;
+  s = tensorflow::NewServer(def, &svr);
+
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not create server: " << s.error_message();
+    return -1;
+  }
+  TF_QCHECK_OK(svr->Start());
+  TF_QCHECK_OK(svr->Join());
+  return 0;
+}
diff --git a/tensorflow_networking/verbs/verbs_util.cc b/tensorflow_networking/verbs/verbs_util.cc
index 20d2c71..356f856 100644
--- a/tensorflow_networking/verbs/verbs_util.cc
+++ b/tensorflow_networking/verbs/verbs_util.cc
@@ -44,7 +44,7 @@ void VerbsUtil::GetKeyAndStepId(const string& key_with_step_id, string& key,
   CHECK(parts.size() == 6) << "Key with step_id must have 6 parts";
   strings::safe_strto64(parts[5], &step_id);
   parts.pop_back();  // remove step_id
-  key.assign(str_util::Join(parts, ";"));  // stitch them together
+  key.assign(absl::StrJoin(parts, ";"));  // stitch them together
 }

 }  // namespace tensorflow
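The join that `GetKeyAndStepId` undoes is the one `AppendStepidToKey` performs: a rendezvous key has five `;`-separated parts, and the step id is appended as a sixth. A self-contained illustration of the round trip (the key string is a made-up example in the usual rendezvous format):

    // Stand-alone illustration of the AppendStepidToKey / GetKeyAndStepId
    // round trip; uses only the standard library.
    #include <cassert>
    #include <cstdint>
    #include <string>

    int main() {
      const std::string key =
          "/job:worker/replica:0/task:0;0000000000000001;"
          "/job:worker/replica:0/task:1;edge_1_tensor;0:0";  // 5 parts
      const int64_t step_id = 42;

      // AppendStepidToKey: glue ";<step_id>" onto the key.
      const std::string key_with_step = key + ";" + std::to_string(step_id);

      // GetKeyAndStepId: the step id is everything after the last ';'.
      const size_t pos = key_with_step.rfind(';');
      assert(std::stoll(key_with_step.substr(pos + 1)) == step_id);
      assert(key_with_step.substr(0, pos) == key);
      return 0;
    }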
diff --git a/tensorflow_networking/verbs/verbs_util.h b/tensorflow_networking/verbs/verbs_util.h
index 6277bc4..db76b2e 100644
--- a/tensorflow_networking/verbs/verbs_util.h
+++ b/tensorflow_networking/verbs/verbs_util.h
@@ -17,11 +17,106 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_

 #include <string>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
+#include <infiniband/verbs.h>

 #include "tensorflow/core/framework/types.h"

 namespace tensorflow {

+namespace {
+
+int RDMACQNUMS() {
+  const char* env_p = std::getenv("RDMA_CQ_NUMS");
+  int nums = 1;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> nums;
+  }
+  LOG(INFO) << "RDMA_CQ_NUMS:" << nums;
+  return nums;
+}
+
+int RDMACQPOOLSIZE() {
+  const char* env_p = std::getenv("RDMA_CQPOOL_SIZE");
+  int pool_size = 20;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> pool_size;
+  }
+  return pool_size;
+}
+
+int RDMATENSORBUFFERRATIO() {
+  const char* env_p = std::getenv("RDMA_TENSOR_BUFFER_RATIO");
+  int ratio = 5;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> ratio;
+  }
+  LOG(INFO) << "RDMA_TENSOR_BUFFER_RATIO:" << ratio;
+  return ratio;
+}
+
+int RDMAENABLESENDDRIERN() {
+  const char* env_p = std::getenv("RDMASendDriver");
+  int send_driver = 0;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> send_driver;
+  }
+  return send_driver;
+}
+
+int RDMACHUNKSIZE() {
+  const char* env_p = std::getenv("RDMAChunkSize");
+  int chunk_size = 60 * 1024 * 1024;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> chunk_size;
+  }
+  return chunk_size;
+}
+
+std::string GetMetaOutput() {
+  const char* env_p = std::getenv("RDMAMetaOutput");
+  if (env_p != nullptr) {
+    return std::string(env_p);
+  }
+  return "viewfs://hadoop-meituan/user/hadoop-hdp/wuyongyu02/default_output";
+}
+
+std::string GetWorkerMetas() {
+  /*
+    edge_name#size|edge_name#size
+  */
+  const char* env_p = std::getenv("RDMAWorkerMetas");
+  if (env_p != nullptr) {
+    return std::string(env_p);
+  }
+  return "edge_6389_global_step;0:0#80";
+}
+
+}  // end namespace
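The helpers above all repeat one pattern: read an integer knob from the environment, falling back to a default. A sketch of a single helper that could replace them (the name `IntFromEnv` is mine, not the patch's):

    // Possible consolidation of the repeated getenv-and-parse pattern.
    #include <cstdlib>
    #include <sstream>

    static int IntFromEnv(const char* name, int default_value) {
      const char* env_p = std::getenv(name);
      if (env_p == nullptr) return default_value;
      int value = default_value;
      std::stringstream ss(env_p);
      ss >> value;
      return value;
    }

    // e.g. int cq_nums = IntFromEnv("RDMA_CQ_NUMS", 1);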
 class VerbsUtil {
  public:
   static string AppendStepidToKey(const string& key, int64 step_id);
@@ -29,5 +124,104 @@ class VerbsUtil {
                               int64& step_id);
 };

+
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))
+
+template <typename T>
+static inline T align_floor(T v, T align) {
+  return v - (v % align);
+}
+
+template <typename T>
+static inline T align_ceil(T v, T align) {
+  return align_floor(v + align - 1, align);
+}
+
+static inline size_t ib_allocate_size(size_t size) {
+  size_t page_size = 4096;
+  return ROUNDUP(size, page_size);
+}
+
+static inline void ib_malloc(void** ptr, size_t* allocate_size, size_t size,
+                             int minimum_alignment) {
+  void* p;
+  *allocate_size = size;
+  const int required_alignment = sizeof(void*);
+  if (minimum_alignment < required_alignment) {
+    p = malloc(size);
+  } else {
+    int err = posix_memalign(&p, minimum_alignment, size);
+    CHECK_EQ(err, 0) << "posix_memalign failed";
+  }
+  *ptr = p;
+}
+
+class MemoryAllocator {
+ public:
+  explicit MemoryAllocator(struct ibv_pd *pd) {
+    std::lock_guard<std::mutex> lk(mu_);
+    pd_ = pd;
+  }
+
+  ~MemoryAllocator() {
+    std::lock_guard<std::mutex> lk(mu_);
+    for (auto &it : mr_) {
+      ibv_dereg_mr(it.second);
+      free(it.first);
+    }
+  }
+
+  char *Alloc(size_t size) {
+    if (size == 0) {
+      return nullptr;
+    }
+
+    // align to page size (usually 4KB)
+    size = align_ceil(size, pagesize_);
+
+    char *p;
+    size_t allocate_size = size;
+    ib_malloc((void**) &p, &allocate_size, size, 64);
+    CHECK(p);
+
+    struct ibv_mr *mr;
+    CHECK(mr = ibv_reg_mr(pd_, p, size,
+                          IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+
+    std::lock_guard<std::mutex> lk(mu_);
+    mr_[p] = mr;
+    used_list.emplace(p, size);
+
+    return p;
+  }
+
+  uint32_t LocalKey(char *addr) {
+    return Addr2MR(addr)->lkey;
+  }
+
+  uint32_t RemoteKey(char *addr) {
+    return Addr2MR(addr)->rkey;
+  }
+
+  struct ibv_pd* GetPD() {
+    return pd_;
+  }
+
+ private:
+  // convert the memory address to its associated RDMA memory region
+  inline struct ibv_mr* Addr2MR(char *addr) {
+    std::lock_guard<std::mutex> lk(mu_);
+    auto it = mr_.find(addr);
+    CHECK(it != mr_.end());
+
+    return it->second;
+  }
+
+  std::mutex mu_;
+  struct ibv_pd *pd_;
+  size_t pagesize_ = sysconf(_SC_PAGESIZE);
+  std::unordered_map<char*, size_t> used_list;
+  std::unordered_map<char*, struct ibv_mr*> mr_;
+};
+
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_
diff --git a/tensorflow_networking/verbs/verbs_with_0_copies.xml b/tensorflow_networking/verbs/verbs_with_0_copies.xml
index c3d79e7..16130a9 100644
--- a/tensorflow_networking/verbs/verbs_with_0_copies.xml
+++ b/tensorflow_networking/verbs/verbs_with_0_copies.xml
@@ -1 +1 @@
-7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D
+7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D \ No newline at end of file diff --git a/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml b/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml index c6b49d7..484e7c7 100644 --- a/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml +++ b/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml @@ -1 +1 @@ 
-7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8= 
+7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8= \ No newline at end of file