diff --git a/tensorflow_networking/verbs/BUILD b/tensorflow_networking/verbs/BUILD
index 88a93eb..3e50adc 100644
--- a/tensorflow_networking/verbs/BUILD
+++ b/tensorflow_networking/verbs/BUILD
@@ -1,13 +1,41 @@
 # Description:
 #   Verbs RDMA communication interfaces and implementations for TensorFlow.
 
-package(default_visibility = [
-    "//tensorflow_networking:__subpackages__",
-])
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
-licenses(["notice"])  # Apache 2.0
+# For platform specific build config
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config.bzl",
+    "tf_proto_library_cc",
+)
 
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load(
+    "@org_tensorflow//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+    "tf_cuda_library",
+)
+
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
+
+# For platform specific build config
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config.bzl",
+    "tf_kernel_tests_linkstatic",
+)
+
+load(
+    "@org_tensorflow//tensorflow/core/platform:default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+
+package(
+    default_visibility = [
+        "//tensorflow_networking:__subpackages__",
+    ],
+    licenses = ["notice"],  # Apache 2.0
+)
 
 exports_files(["LICENSE"])
 
@@ -19,12 +47,6 @@ filegroup(
     ]),
 )
 
-# For platform specific build config
-load(
-    "@org_tensorflow//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_cc",
-)
-
 tf_proto_library_cc(
     name = "verbs_service_proto",
     srcs = ["verbs_service.proto"],
@@ -43,6 +65,10 @@ cc_library(
         "@org_tensorflow//tensorflow/core:framework",
         "@org_tensorflow//tensorflow/core:lib",
     ],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:with_verbs_support": ["-libverbs"],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -52,9 +78,10 @@ cc_library(
     deps = [
         ":grpc_verbs_service_impl",
         ":rdma_mgr",
+        ":rdma",
         ":verbs_service_proto_cc",
         "@org_tensorflow//tensorflow:grpc++",
-        "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:async_service_interface",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_call",
@@ -77,6 +104,7 @@ cc_library(
     name = "grpc_verbs_client",
     srcs = ["grpc_verbs_client.cc"],
     hdrs = ["grpc_verbs_client.h"],
+    copts = ["-Og", "-g3"],
     deps = [
         ":grpc_verbs_service_impl",
         ":verbs_service_proto_cc",
@@ -90,27 +118,34 @@ cc_library(
 cc_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
-    hdrs = ["rdma_rendezvous_mgr.h"],
+    hdrs = ["rdma_rendezvous_mgr.h", "rdma.h"],
+    copts = ["-Og", "-g3"],
    deps = [
         ":rdma_mgr",
         ":verbs_util",
-        "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
+        #"@org_tensorflow//tensorflow/core:gpu_runtime",
         "@org_tensorflow//tensorflow/core:lib",
         "@org_tensorflow//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
+        #"@org_tensorflow//tensorflow/core/distributed_runtime:worker_cache_partial",
     ],
 )
 
 tf_cuda_library(
     name = "rdma_mgr",
     srcs = ["rdma_mgr.cc"],
-    hdrs = ["rdma_mgr.h"],
+    hdrs = ["rdma_mgr.h", "rdma.h"],
+    copts = ["-Og", "-g3"],
     deps = [
         ":grpc_verbs_client",
-        ":rdma",
+        #":rdma",
+        ":verbs_util",
         ":verbs_service_proto_cc",
         "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
         "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
         "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_channel",
@@ -118,21 +153,31 @@ tf_cuda_library(
     ],
 )
 
+
 tf_cuda_library(
     name = "rdma",
     srcs = ["rdma.cc"],
     hdrs = ["rdma.h"],
-    linkopts = ["-libverbs"],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:with_verbs_support": ["-libverbs"],
+        "//conditions:default": [],
+    }),
+    copts = ["-Og", "-g3"],
     deps = [
+        ":rdma_mgr",
         ":grpc_verbs_client",
         ":verbs_service_proto_cc",
         ":verbs_util",
-        "@org_tensorflow//tensorflow/core",
+        #"@org_tensorflow//tensorflow/core:core_cpu_internal",
         "@org_tensorflow//tensorflow/core:framework",
+        #"@org_tensorflow//tensorflow/core:framework_internal",
+        #"@org_tensorflow//tensorflow/core:gpu_runtime",
         "@org_tensorflow//tensorflow/core:lib",
+        #"@org_tensorflow//tensorflow/core:lib_internal",
        "@org_tensorflow//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
         "@org_tensorflow//tensorflow/core/distributed_runtime:session_mgr",
         "@org_tensorflow//tensorflow/core/distributed_runtime:worker_env",
+        "@org_tensorflow//tensorflow/core/distributed_runtime/rpc:grpc_channel",
     ],
 )
 
@@ -151,3 +196,4 @@ cc_library(
     ],
     alwayslink = 1,
 )
+
diff --git a/tensorflow_networking/verbs/Dockerfile b/tensorflow_networking/verbs/Dockerfile
deleted file mode 100644
index cecb40d..0000000
--- a/tensorflow_networking/verbs/Dockerfile
+++ /dev/null
@@ -1,82 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-
-FROM ubuntu:${UBUNTU_VERSION} AS base
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        libibverbs-dev \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-ENV CI_BUILD_PYTHON python
-
-ARG USE_PYTHON_3_NOT_2
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-# See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} --no-cache-dir install --upgrade \
-    pip \
-    setuptools
-
-# Some TF tools expect a "python" binary
-RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    wget \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-RUN ${PIP} --no-cache-dir install \
-    Pillow \
-    h5py \
-    keras_applications \
-    keras_preprocessing \
-    matplotlib \
-    mock \
-    numpy \
-    scipy \
-    sklearn \
-    pandas \
-    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
-    enum34
-
-# Install bazel
-ARG BAZEL_VERSION=0.24.1
-RUN mkdir /bazel && \
-    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
-    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
-    chmod +x /bazel/installer.sh && \
-    /bazel/installer.sh && \
-    rm -f /bazel/installer.sh
-
-ADD . /tf_networking
-WORKDIR /tf_networking
-RUN bazel build -c opt //tensorflow_networking/verbs:verbs_server_lib
diff --git a/tensorflow_networking/verbs/docker_howto.txt b/tensorflow_networking/verbs/docker_howto.txt
deleted file mode 100644
index 16b8392..0000000
--- a/tensorflow_networking/verbs/docker_howto.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Building a networking component may require libraries not available or installable in your normal environment.
-As of late 2018 the networking contrib extensions to TensorFlow 1.x can be built in a docker container, as follows.
-
-1. Ensure docker is installed.
-2. Invoke a docker container with the latest nightly development build:
-   $ docker run -it -w /tensorflow -v $PWD:/mnt -e HOST_PERMS="$(id -u):$(id -g)" tensorflow/tensorflow:nightly-devel bash
-3. Configure for bazel build
-   $ ./configure
-4. Install any necessary additional packages, e.g.
-   $ apt-get update
-   $ apt-get install libibverbs-dev
-5. Build with the desired extension
-   $ bazel build --config=verbs //tensorflow/tools/pip_package:build_pip_package
diff --git a/tensorflow_networking/verbs/grpc_verbs_client.cc b/tensorflow_networking/verbs/grpc_verbs_client.cc
index 28411fd..d0ad3ef 100644
--- a/tensorflow_networking/verbs/grpc_verbs_client.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_client.cc
@@ -37,6 +37,39 @@ Status GrpcVerbsClient::GetRemoteAddress(const GetRemoteAddressRequest* request,
   return GetRemoteAddress(&call_options, request, response);
 }
 
+
+Status GrpcVerbsClient::ReqDriverMessage(CallOptions* call_options,
+                                         const DriverMessageReq* request,
+                                         DriverMessageResp* response) {
+  ::grpc::ClientContext ctx;
+  ctx.set_fail_fast(false);
+  SetDeadline(&ctx, call_options->GetTimeout());
+  return FromGrpcStatus(stub_->ReqDriverMessage(&ctx, *request, response));
+}
+
+Status GrpcVerbsClient::ReqDriverMessage(const DriverMessageReq* request,
+                                         DriverMessageResp* response) {
+  CallOptions call_options;
+  call_options.SetTimeout(-1);  // no time out
+  return ReqDriverMessage(&call_options, request, response);
+}
+
+Status GrpcVerbsClient::ReqPleSendOrCheck(CallOptions* call_options,
+                                          const PleSendOrCheckReq* request,
+                                          PleSendOrCheckResp* response) {
+  ::grpc::ClientContext ctx;
+  ctx.set_fail_fast(false);
+  SetDeadline(&ctx, call_options->GetTimeout());
+  return FromGrpcStatus(stub_->ReqPleSendOrCheck(&ctx, *request, response));
+}
+
+Status GrpcVerbsClient::ReqPleSendOrCheck(const PleSendOrCheckReq* request,
+                                          PleSendOrCheckResp* response) {
+  CallOptions call_options;
+  call_options.SetTimeout(-1);  // no time out
+  return ReqPleSendOrCheck(&call_options, request, response);
+}
+
 void GrpcVerbsClient::SetDeadline(::grpc::ClientContext* ctx,
                                   int64 time_in_ms) {
   if (time_in_ms > 0) {
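An illustrative caller (not part of the patch; SendDriverHandshake is a hypothetical name) showing how the new synchronous wrappers are meant to be used — the overload without CallOptions runs with no deadline, mirroring what NotifyRemoteDriverEntry() does later in this patch:

// Sketch only: assumes the GrpcChannelCache, SharedGrpcChannelPtr, and
// GrpcVerbsClient types declared elsewhere in this repository.
Status SendDriverHandshake(GrpcChannelCache* cache, const string& local_name,
                           const string& remote_name) {
  SharedGrpcChannelPtr channel = cache->FindWorkerChannel(remote_name);
  CHECK(channel != nullptr) << "No worker known as " << remote_name;
  GrpcVerbsClient client(channel);
  DriverMessageReq req;
  req.set_host_name(local_name);  // tell the peer who is calling
  DriverMessageResp resp;
  // Blocks until the peer answers; SetTimeout(-1) is applied internally.
  return client.ReqDriverMessage(&req, &resp);
}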
diff --git a/tensorflow_networking/verbs/grpc_verbs_client.h b/tensorflow_networking/verbs/grpc_verbs_client.h
index db01798..c703f64 100644
--- a/tensorflow_networking/verbs/grpc_verbs_client.h
+++ b/tensorflow_networking/verbs/grpc_verbs_client.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
 #define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
 
+#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 
 namespace tensorflow {
 
@@ -37,6 +37,19 @@ class GrpcVerbsClient {
   Status GetRemoteAddress(const GetRemoteAddressRequest* request,
                           GetRemoteAddressResponse* response);
 
+  Status ReqDriverMessage(CallOptions* call_options,
+                          const DriverMessageReq* request,
+                          DriverMessageResp* response);
+  Status ReqDriverMessage(const DriverMessageReq* request,
+                          DriverMessageResp* response);
+
+  Status ReqPleSendOrCheck(CallOptions* call_options,
+                           const PleSendOrCheckReq* request,
+                           PleSendOrCheckResp* response);
+
+  Status ReqPleSendOrCheck(const PleSendOrCheckReq* request,
+                           PleSendOrCheckResp* response);
+
  private:
   std::unique_ptr<grpc::VerbsService::Stub> stub_;
 
diff --git a/tensorflow_networking/verbs/grpc_verbs_service.cc b/tensorflow_networking/verbs/grpc_verbs_service.cc
index 11d8704..b38358a 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_service.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/grpc_verbs_service.h"
+
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 
@@ -71,12 +74,34 @@ void GrpcVerbsService::Shutdown() {
     }                                                                        \
   } while (0)
 
+#define ENQUEUE_Driver_REQUEST(method, method_func, supports_cancel)           \
+  do {                                                                         \
+    mutex_lock l(shutdown_mu_);                                                \
+    if (!is_shutdown_) {                                                       \
+      Call<GrpcVerbsService, grpc::VerbsService::AsyncService,                 \
+           method##Req, method##Resp>::                                        \
+          EnqueueRequest(&verbs_service_, cq_,                                 \
+                         &grpc::VerbsService::AsyncService::Request##method_func, \
+                         &GrpcVerbsService::method_func##Handler,              \
+                         (supports_cancel));                                   \
+    }                                                                          \
+  } while (0)
+
+
 // This method blocks forever handling requests from the completion queue.
 void GrpcVerbsService::HandleRPCsLoop() {
   for (int i = 0; i < 10; ++i) {
     ENQUEUE_REQUEST(GetRemoteAddress, false);
   }
+  for (int i = 0; i < 10; ++i) {
+    ENQUEUE_Driver_REQUEST(DriverMessage, ReqDriverMessage, false);
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    ENQUEUE_Driver_REQUEST(PleSendOrCheck, ReqPleSendOrCheck, false);
+  }
+
 
   void* tag;
   bool ok;
@@ -98,6 +123,77 @@ void GrpcVerbsService::GetRemoteAddressHandler(
   ENQUEUE_REQUEST(GetRemoteAddress, false);
 }
 
+void GrpcVerbsService::ReqDriverMessageHandler(
+    WorkerCall<DriverMessageReq, DriverMessageResp>* call) {
+  Status s = ReqDriverMessageSync(&call->request, &call->response);
+  call->SendResponse(ToGrpcStatus(s));
+  ENQUEUE_Driver_REQUEST(DriverMessage, ReqDriverMessage, false);
+}
+
+void GrpcVerbsService::ReqPleSendOrCheckHandler(
+    WorkerCall<PleSendOrCheckReq, PleSendOrCheckResp>* call) {
+  Status s = ReqPleSendOrCheckSync(&call->request, &call->response);
+  call->SendResponse(ToGrpcStatus(s));
+  ENQUEUE_Driver_REQUEST(PleSendOrCheck, ReqPleSendOrCheck, false);
+}
+
+// synchronous method
+Status GrpcVerbsService::ReqDriverMessageSync(const DriverMessageReq* request,
+                                              DriverMessageResp* response) {
+  // analyze the send-driven request
+  const string& remote_host_name = request->host_name();
+  RdmaChannel* channel = rdma_mgr_->FindChannel(remote_host_name);
+  CHECK(channel != nullptr) << "GrpcVerbsService RdmaChannel for:"
+                            << remote_host_name << " is nullptr";
+  RDMA_LOG(1) << "GrpcVerbsService Channel local_name_:"
+              << channel->local_name_;
+  string worker_name = worker_env_->session_mgr->LegacySession()->worker_name;
+
+  CHECK(worker_name == channel->local_name_)
+      << "worker_name != channel->local_name_"
+      << " worker_name:" << worker_name
+      << " channel->local_name_:" << channel->local_name_;
+
+  // LOG(INFO) << "GrpcVerbsService recv: " << remote_host_name;
+  std::shared_ptr<RdmaSendDriverMgr> driver_mgr_ptr =
+      channel->GetRdmaSendDriverMgr();
+  driver_mgr_ptr->RpcUpdateRemoteDriverEntry(request, response);
+  return Status::OK();
+}
+
+Status GrpcVerbsService::ReqPleSendOrCheckSync(const PleSendOrCheckReq* request,
+                                               PleSendOrCheckResp* response) {
+  // analyze the request
+  const string& remote_host_name = request->host_name();
+  RdmaChannel* channel = rdma_mgr_->FindChannel(remote_host_name);
+  CHECK(channel != nullptr) << "ReqPleSendOrCheckSync RdmaChannel for:"
+                            << remote_host_name << " is nullptr";
+  LOG(INFO) << "ReqPleSendOrCheckSync Channel local_name_:"
+            << channel->local_name_;
+  string worker_name = worker_env_->session_mgr->LegacySession()->worker_name;
+
+  CHECK(worker_name == channel->local_name_)
+      << "worker_name != channel->local_name_"
+      << " worker_name:" << worker_name
+      << " channel->local_name_:" << channel->local_name_;
+
+  if (channel->could_send_driver_) {
+    LOG(INFO) << "ReqPleSendOrCheckSync for remote:"
+              << remote_host_name
+              << " is ok";
+    response->set_host_name(channel->local_name_);
+    response->set_is_ok(true);
+    return Status::OK();
+  }
+
+  // service allocates static memory and notifies the endpoint
+  // TODO(wuyongyu02): change to large MR
+  channel->InitAndSetDriverStatus();
+  response->set_host_name(channel->local_name_);
+  response->set_is_ok(true);
+  // LOG(INFO) << "ReqPleSendOrCheckSync recv: " << remote_host_name;
+  return Status::OK();
+}
+
 // synchronous method
 Status GrpcVerbsService::GetRemoteAddressSync(
     const GetRemoteAddressRequest* request,
@@ -116,7 +212,13 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   rc->SetRemoteAddress(ra, false);
   rc->Connect();
   int i = 0;
-  int idx[] = {1, 0};
+  // int idx[] = {1, 0};
+  int idx[RdmaChannel::kNumMessageBuffers + 1];
+  for (auto k = 0; k < RdmaChannel::kNumMessageBuffers; k = k + 2) {
+    idx[k] = k + 1;
+    idx[k + 1] = k;
+    // LOG(ERROR) << "idx[" << k << "]:" << idx[k] << " idx[" << k+1 << "]:" << idx[k+1];
+  }
   std::vector<RdmaMessageBuffer*> mb(rc->message_buffers());
   CHECK_EQ(request->mr_size(), RdmaChannel::kNumMessageBuffers);
   for (const auto& mr : request->mr()) {
@@ -136,7 +238,7 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   // setting up response
   response->set_host_name(
       worker_env_->session_mgr->LegacySession()->worker_name);
-  Channel* channel_info = response->mutable_channel();
+  ChannelInfo* channel_info = response->mutable_channel();
   channel_info->set_lid(rc->self().lid);
   channel_info->set_qpn(rc->self().qpn);
   channel_info->set_psn(rc->self().psn);
@@ -157,3 +259,5 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 }
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
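Illustration only (not part of the patch): the loop in GetRemoteAddressSync above generalizes the old idx[] = {1, 0} to any kNumMessageBuffers. The remote MR list arrives ordered tx0, rx0, tx1, rx1, ..., and each local buffer must bind to the opposite role on the peer, so adjacent indices are swapped pairwise. A standalone check of the mapping, with kNumMessageBuffers assumed to be 4:

#include <cstdio>

int main() {
  const int kNumMessageBuffers = 4;  // assumed value for the example
  int idx[kNumMessageBuffers];
  for (int k = 0; k < kNumMessageBuffers; k += 2) {
    idx[k] = k + 1;  // local tx_k pairs with the remote rx_k
    idx[k + 1] = k;  // local rx_k pairs with the remote tx_k
  }
  for (int k = 0; k < kNumMessageBuffers; ++k)
    std::printf("local buffer %d <- remote mr %d\n", k, idx[k]);
  return 0;  // prints 0<-1, 1<-0, 2<-3, 3<-2
}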
diff --git a/tensorflow_networking/verbs/grpc_verbs_service.h b/tensorflow_networking/verbs/grpc_verbs_service.h
index 0d36859..494798b 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service.h
+++ b/tensorflow_networking/verbs/grpc_verbs_service.h
@@ -16,15 +16,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 #define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "grpcpp/alarm.h"
 #include "grpcpp/grpcpp.h"
 #include "grpcpp/server_builder.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
+#include "tensorflow_networking/verbs/rdma.h"
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
 #include "tensorflow/core/lib/core/refcount.h"
-#include "tensorflow_networking/verbs/grpc_verbs_service_impl.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 
 namespace tensorflow {
 
@@ -44,11 +47,23 @@ class GrpcVerbsService : public AsyncServiceInterface {
       WorkerCall<GetRemoteAddressRequest, GetRemoteAddressResponse>* call);
   Status GetRemoteAddressSync(const GetRemoteAddressRequest* request,
                               GetRemoteAddressResponse* response);
+
+  void ReqDriverMessageHandler(
+      WorkerCall<DriverMessageReq, DriverMessageResp>* call);
+
+  void ReqPleSendOrCheckHandler(
+      WorkerCall<PleSendOrCheckReq, PleSendOrCheckResp>* call);
+
+  Status ReqDriverMessageSync(const DriverMessageReq* request,
+                              DriverMessageResp* response);
+
+  Status ReqPleSendOrCheckSync(const PleSendOrCheckReq* request,
+                               PleSendOrCheckResp* response);
 
   ::grpc::ServerCompletionQueue* cq_;
   grpc::VerbsService::AsyncService verbs_service_;
   mutex shutdown_mu_;
-  bool is_shutdown_ TF_GUARDED_BY(shutdown_mu_);
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
   ::grpc::Alarm* shutdown_alarm_;
   // not owned
   RdmaMgr* rdma_mgr_;
@@ -63,4 +78,5 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
diff --git a/tensorflow_networking/verbs/grpc_verbs_service_impl.cc b/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
index 506bcb4..3c8d8bd 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
+++ b/tensorflow_networking/verbs/grpc_verbs_service_impl.cc
@@ -30,6 +30,8 @@ namespace grpc {
 
 static const char* grpcVerbsService_method_names[] = {
     "/tensorflow.VerbsService/GetRemoteAddress",
+    "/tensorflow.VerbsService/ReqDriverMessage",
+    "/tensorflow.VerbsService/ReqPleSendOrCheck"
 };
 
 std::unique_ptr<VerbsService::Stub> VerbsService::NewStub(
@@ -44,7 +46,14 @@ VerbsService::Stub::Stub(
     : channel_(channel),
       rpcmethod_GetRemoteAddress_(grpcVerbsService_method_names[0],
                                   ::grpc::internal::RpcMethod::NORMAL_RPC,
-                                  channel) {}
+                                  channel),
+      rpcmethod_ReqDriverMessage_(grpcVerbsService_method_names[1],
+                                  ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                  channel),
+      rpcmethod_ReqPleSendOrCheck_(grpcVerbsService_method_names[2],
+                                   ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                   channel)
+      {}
 
 ::grpc::Status VerbsService::Stub::GetRemoteAddress(
     ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
@@ -53,8 +62,24 @@ ::grpc::Status VerbsService::Stub::GetRemoteAddress(
       channel_.get(), rpcmethod_GetRemoteAddress_, context, request, response);
 }
 
+::grpc::Status VerbsService::Stub::ReqDriverMessage(
+    ::grpc::ClientContext* context, const DriverMessageReq& request,
+    DriverMessageResp* response) {
+  // LOG(INFO) << "Stub ReqDriverMessage..."
+  //           << " rpcmethod_ReqDriverMessage_:" << rpcmethod_ReqDriverMessage_;
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReqDriverMessage_, context, request, response);
+}
+
+::grpc::Status VerbsService::Stub::ReqPleSendOrCheck(
+    ::grpc::ClientContext* context, const PleSendOrCheckReq& request,
+    PleSendOrCheckResp* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReqPleSendOrCheck_, context, request, response);
+}
+
 VerbsService::AsyncService::AsyncService() {
-  for (int i = 0; i < 1; ++i) {
+  for (int i = 0; i < 3; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcVerbsService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow_networking/verbs/grpc_verbs_service_impl.h b/tensorflow_networking/verbs/grpc_verbs_service_impl.h
index cdd8904..caabc85 100644
--- a/tensorflow_networking/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow_networking/verbs/grpc_verbs_service_impl.h
@@ -32,7 +32,7 @@ namespace tensorflow {
 namespace grpc {
 
 // Implementation of `tensorflow.VerbsService`, based on the
-// definition in "//tensorflow_networking/verbs/verbs_service.proto",
+// definition in "//tensorflow/contrib/verbs/verbs_service.proto",
 // and the gRPC generated stub and service classes.
 // See the proto file for the definition of methods and messages.
 class VerbsService GRPC_FINAL {
@@ -43,6 +43,13 @@ class VerbsService GRPC_FINAL {
     virtual ::grpc::Status GetRemoteAddress(
         ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
         GetRemoteAddressResponse* response) = 0;
+    virtual ::grpc::Status ReqDriverMessage(
+        ::grpc::ClientContext* context, const DriverMessageReq& request,
+        DriverMessageResp* response) = 0;
+
+    virtual ::grpc::Status ReqPleSendOrCheck(
+        ::grpc::ClientContext* context, const PleSendOrCheckReq& request,
+        PleSendOrCheckResp* response) = 0;
   };
   class Stub GRPC_FINAL : public StubInterface {
    public:
@@ -51,9 +58,21 @@ class VerbsService GRPC_FINAL {
         ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
         GetRemoteAddressResponse* response) GRPC_OVERRIDE;
 
+    ::grpc::Status ReqDriverMessage(
+        ::grpc::ClientContext* context,
+        const DriverMessageReq& request,
+        DriverMessageResp* response) GRPC_OVERRIDE;
+
+    ::grpc::Status ReqPleSendOrCheck(
+        ::grpc::ClientContext* context,
+        const PleSendOrCheckReq& request,
+        PleSendOrCheckResp* response) GRPC_OVERRIDE;
+
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
     const ::grpc::internal::RpcMethod rpcmethod_GetRemoteAddress_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReqDriverMessage_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReqPleSendOrCheck_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -71,6 +90,25 @@ class VerbsService GRPC_FINAL {
       ::grpc::Service::RequestAsyncUnary(0, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+
+    void RequestReqDriverMessage(
+        ::grpc::ServerContext* context, DriverMessageReq* request,
+        ::grpc::ServerAsyncResponseWriter<DriverMessageResp>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(1, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+
+    void RequestReqPleSendOrCheck(
+        ::grpc::ServerContext* context, PleSendOrCheckReq* request,
+        ::grpc::ServerAsyncResponseWriter<PleSendOrCheckResp>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(2, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+
   };
 };
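Aside (sketch, not in the patch): the wire ordinal of each RPC must stay consistent across three places — the grpcVerbsService_method_names array, the Stub's RpcMethod constructors, and the index passed to RequestAsyncUnary() (plus the AddMethod() loop bound). One way to keep them from drifting is a shared enum; the names below are hypothetical:

enum VerbsMethodIndex {
  kGetRemoteAddress = 0,   // grpcVerbsService_method_names[0]
  kReqDriverMessage = 1,   // grpcVerbsService_method_names[1]
  kReqPleSendOrCheck = 2,  // grpcVerbsService_method_names[2]
  kNumVerbsMethods         // == 3, the AsyncService constructor bound
};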
diff --git a/tensorflow_networking/verbs/rdma.cc b/tensorflow_networking/verbs/rdma.cc
index b4b3dad..971dd32 100644
--- a/tensorflow_networking/verbs/rdma.cc
+++ b/tensorflow_networking/verbs/rdma.cc
@@ -13,14 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include <fcntl.h>
 #include <cstdlib>
+#include <functional>
+#include <memory>
+#include <regex>
+#include <set>
+#include <sstream>
+#include <string>
 
+#include "tensorflow_networking/verbs/rdma.h"
+#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
-#include "tensorflow_networking/verbs/rdma.h"
-#include "tensorflow_networking/verbs/verbs_service.pb.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
@@ -54,6 +62,12 @@ string MessageTypeToString(RdmaMessageType rmt) {
     case RDMA_MESSAGE_TENSOR_REQUEST:
       return "RDMA_MESSAGE_TENSOR_REQUEST";
       break;
+    case RDMA_MESSAGE_DRIVER_BEGIN:
+      return "RDMA_MESSAGE_DRIVER_BEGIN";
+      break;
+    case RDMA_MESSAGE_ERROR_STATUS:
+      return "RDMA_MESSAGE_ERROR_STATUS";
+      break;
     default:
       return "UNKNOWN MESSAGE";
   }
@@ -79,6 +93,8 @@ string get_env_var(char const* var_name) {
 
 ibv_context* open_device(ibv_device* ibv_dev) {
   ibv_context* context = ibv_open_device(ibv_dev);
+  LOG(INFO) << "RDMA context->num_comp_vectors:" << context->num_comp_vectors;
+
   CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
   return context;
 }
@@ -98,6 +114,13 @@ int get_dev_active_port_count(ibv_device* device) {
   CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
   rc = ibv_query_device(context, &device_att);
   CHECK(!rc) << "Failed to query the device";
+  LOG(INFO) << "[RDMA Device Info] "
+            << " max_qp:" << device_att.max_qp
+            << " max_cq:" << device_att.max_cq
+            << " max_pd:" << device_att.max_pd
+            << " max_mr:" << device_att.max_mr
+            << " max_mr_size:" << device_att.max_mr_size;
+
 
   for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
     rc = ibv_query_port(context, port_index, &port_attr);
@@ -398,128 +421,348 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
   return pd;
 }
 
+Chunk::Chunk(struct ibv_pd* pd) :
+    pd_(pd), allocate_size_(0), curr_size_(0), empty_size_(0),
+    offset_(0), total_waste_size_(0), total_realloc_size_(0) {
+  chunk_addr_size = VerbsEnvRegistrar::Instance()->RdmaChunkSize();
+  if (EIGEN_MAX_ALIGN_BYTES > 0) {
+    int ratio = (chunk_addr_size + EIGEN_MAX_ALIGN_BYTES) / EIGEN_MAX_ALIGN_BYTES;
+    chunk_addr_size = ratio * EIGEN_MAX_ALIGN_BYTES;
+  }
+  LOG(INFO) << "chunk size:"
+            << chunk_addr_size
+            << " EIGEN_MAX_ALIGN_BYTES:"
+            << EIGEN_MAX_ALIGN_BYTES;
+}
+
+void Chunk::FreeChunk() {
+  LOG(INFO) << "delete Chunk";
+  for (auto& it : mrs_) {
+    ibv_dereg_mr(it);
+  }
+  for (auto& it : chunk_addrs_) {
+    free(it);
+  }
+}
+
+Chunk::~Chunk() { }
+
+void Chunk::Alloc(size_t size, void** p, ibv_mr** mr, size_t realloc_size) {
+  mutex_lock l(alloc_mu_);
+  size_t align_size = size;
+  if (EIGEN_MAX_ALIGN_BYTES > 0) {
+    int ratio = (size + EIGEN_MAX_ALIGN_BYTES - 1) / EIGEN_MAX_ALIGN_BYTES;
+    align_size = ratio * EIGEN_MAX_ALIGN_BYTES;
+  }
+  // not enough empty space left; allocate a new chunk
+  if (empty_size_ < align_size) {
+    size_t malloc_size =
+        (align_size + chunk_addr_size - 1) / chunk_addr_size * chunk_addr_size;
+    curr_size_ += malloc_size;
+    total_waste_size_ += empty_size_;
+    total_realloc_size_ += realloc_size;
+    LOG(INFO) << "RDMA Allocate Memory: " << curr_size_ << " Bytes "
+              << total_waste_size_ << " " << total_realloc_size_;
+    offset_ = 0;
+    empty_size_ = malloc_size;
+    size_t allocate_size = 0;
+    ib_malloc((void**)&new_p_, &allocate_size, malloc_size,
+              EIGEN_MAX_ALIGN_BYTES);
+    new_mr_ = ibv_reg_mr(pd_, new_p_, malloc_size,
+                         IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+    mrs_.emplace_back(new_mr_);
+    chunk_addrs_.emplace_back(new_p_);
+  }
+  *p = (void*)(((char*)new_p_) + offset_);
+  empty_size_ -= align_size;
+  *mr = new_mr_;
+  offset_ += align_size;
+}
+
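Illustration only (standalone arithmetic check with assumed constants): Chunk::Alloc rounds each request up to EIGEN_MAX_ALIGN_BYTES and, when the current chunk cannot hold it, grows the arena by whole multiples of chunk_addr_size:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t kAlign = 64;       // stand-in for EIGEN_MAX_ALIGN_BYTES
  const size_t kChunk = 1 << 20;  // stand-in for chunk_addr_size (1 MiB)
  size_t size = 1000;
  // Same round-up expressions as Chunk::Alloc above.
  size_t align_size = (size + kAlign - 1) / kAlign * kAlign;         // 1024
  size_t malloc_size = (align_size + kChunk - 1) / kChunk * kChunk;  // 1 MiB
  std::printf("align_size=%zu malloc_size=%zu\n", align_size, malloc_size);
  return 0;
}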
 RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
     : context_(open_device(set_device())),
       params_(params_init(context_)),
       pd_(alloc_protection_domain(context_)),
       worker_env_(worker_env) {
-  event_channel_ = ibv_create_comp_channel(context_);
-  CHECK(event_channel_) << "Failed to create completion channel";
-  cq_ = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL, event_channel_,
-                      0);
-  CHECK(cq_) << "Failed to create completion queue";
-  CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
+  recv_chunk_ = new Chunk(pd_);
+  cq_nums_ = VerbsEnvRegistrar::Instance()->RdmaCqNums();
+  wc_vec_ = new ibv_wc*[cq_nums_];
+  cq_vec_ = new ibv_cq*[cq_nums_];
+  event_channel_vec_ = new ibv_comp_channel*[cq_nums_];
+  for (int i = 0; i < cq_nums_; i++) {
+    wc_vec_[i] = new ibv_wc[MAX_CONCURRENT_WRITES * 2];
+    event_channel_vec_[i] = ibv_create_comp_channel(context_);
+    CHECK(event_channel_vec_[i]) << "Failed to create of " << i
+                                 << " completion channel";
+    cq_vec_[i] = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL,
+                               event_channel_vec_[i], 0);
+    CHECK(cq_vec_[i]) << "Failed to create of " << i << " completion queue";
+    CHECK(!ibv_req_notify_cq(cq_vec_[i], 0))
+        << "Failed to request CQ notification";
+  }
+  LOG(INFO) << "RdmaCQpoolSize:"
+            << VerbsEnvRegistrar::Instance()->RdmaCQpoolSize();
+  pool_ = new thread::ThreadPool(Env::Default(), ThreadOptions(),
+      "process_wr_impl", VerbsEnvRegistrar::Instance()->RdmaCQpoolSize(),
+      false, nullptr);
 }
 
 RdmaAdapter::~RdmaAdapter() {
-  polling_thread_.reset();
-  CHECK(!ibv_destroy_cq(cq_)) << "Failed to destroy CQ";
-  CHECK(!ibv_destroy_comp_channel(event_channel_))
-      << "Failed to destroy channel";
+  for (int i = 0; i < cq_nums_; i++) {
+    polling_thread_vec_[i].reset();
+  }
+  for (int i = 0; i < cq_nums_; i++) {
+    CHECK(!ibv_destroy_cq(cq_vec_[i])) << "Failed to destroy CQ";
+    CHECK(!ibv_destroy_comp_channel(event_channel_vec_[i]))
+        << "Failed to destroy channel";
+  }
   CHECK(!ibv_dealloc_pd(pd_)) << "Failed to deallocate PD";
   CHECK(!ibv_close_device(context_)) << "Failed to release context";
+  recv_chunk_->FreeChunk();
+  delete recv_chunk_;
+  recv_chunk_ = nullptr;
 }
 
 void RdmaAdapter::StartPolling() {
-  polling_thread_.reset(Env::Default()->StartThread(
-      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
+  for (int i = 0; i < cq_nums_; i++) {
+    polling_thread_vec_.emplace_back(Env::Default()->StartThread(
+        ThreadOptions(), "RdmaAdapterCQThread",
+        [this, i] { Pool_Process_CQ(i); }));
+  }
   VLOG(2) << "Start RdmaAdapter: " << name();
 }
 
 string RdmaAdapter::name() const { return string(context_->device->name); }
 
-// Function to process incoming messages
-// There are two types of messages:
-// 1. IBV_WC_RECV_RDMA_WITH_IMM (receive)
-// 2. IBV_WC_RDMA_WRITE (send))
-void RdmaAdapter::Process_CQ() {
-  while (true) {
-    ibv_cq* cq;
-    void* cq_context;
-    CHECK(!ibv_get_cq_event(event_channel_, &cq, &cq_context));
-    CHECK(cq == cq_);
-    ibv_ack_cq_events(cq, 1);
-    CHECK(!ibv_req_notify_cq(cq_, 0));
+void RdmaAdapter::Process_WR(ibv_wc wc_, int cq_num) {
+  if (wc_.status != IBV_WC_SUCCESS) {
+    return;
+  }
+  CHECK(wc_.status == IBV_WC_SUCCESS)
+      << "Failed status \n"
+      << ibv_wc_status_str(wc_.status) << " " << wc_.status << " "
+      << static_cast<uint64_t>(wc_.wr_id) << " " << wc_.vendor_err;
+  if (wc_.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+    RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_.wr_id);
+    if (rc == nullptr) {
+      LOG(FATAL) << "Process_WR failed wc_.wr_id:" << wc_.wr_id
+                 << " cq_num:" << cq_num;
+      return;
+    }
+    // put back a recv wr.
+    rc->Recv();
+    // imm_data is the index of RX buffer in the buffer table.
+    uint32_t imm_data = wc_.imm_data;
+    RdmaMessageBuffer* rb;
+    RdmaMessage rm;
+
+    if (imm_data > RDMA_IMM_MAX_REQUEST_ID && imm_data <= RDMA_IMM_DATA_ACK) {
+      // receive an ack to a message
+      int pair_index = imm_data - RDMA_IMM_MAX_REQUEST_ID - 1;
+      int buffer_index = 2 * pair_index;
+      rb = rc->message_buffers()[buffer_index];
+      rb->SetBufferStatus(remote, idle);
+      rb->SendNextItem();
+      return;
+    }
 
-    int ne =
-        ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2, static_cast<ibv_wc*>(wc_));
-    CHECK_GE(ne, 0);
-    for (int i = 0; i < ne; ++i) {
-      CHECK(wc_[i].status == IBV_WC_SUCCESS)
-          << "Failed status \n"
-          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
-          << static_cast<uint64_t>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
-      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-        RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
-        // put back a recv wr.
-        rc->Recv();
-        // imm_data is the index of RX buffer in the buffer table.
-        uint32_t imm_data = wc_[i].imm_data;
-        RdmaMessageBuffer* rb;
-        RdmaMessage rm;
-
-        if (imm_data == RDMA_IMM_DATA_ACK) {
-          // receive an ack to a message
-          rb = rc->tx_message_buffer_;
-          rb->SetBufferStatus(remote, idle);
-          rb->SendNextItem();
-          continue;
+    if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
+      // receive a tensor RDMA write
+      uint32_t request_index = imm_data;
+      if (imm_data < RDMA_IMM_MIN_SENDMGR_BASE) {
+        RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
+        if (request == nullptr) {
+          LOG(INFO) << "Normal request_index:"
+                    << request_index
+                    << " , Normal request is done by SendDriverMgr";
+          return;
+        }
+        RDMA_LOG(1) << "DoNormal request_index:" << request_index;
+        request->RecvTensorContent();
+      } else {
+        // RecvSendDriver
+        const auto& tensors_uid_parsed_key =
+            rc->channel_record_->GetChannelTensorsUidParsedkey();
+        const auto& it = tensors_uid_parsed_key.find(imm_data);
+        if (it == tensors_uid_parsed_key.end()) {
+          LOG(FATAL) << "RdmaTensorRequest parsed_key not found for imm_data:"
+                     << imm_data;
+        }
+        const auto& parsed_key = it->second;
+        bool has_data = false;
+        std::shared_ptr<DriverEntry> entry_ptr =
+            rc->rdma_send_driver_mgr_->GetDriverEntry(parsed_key, &has_data);
+
+        const auto& tensors_meta_data =
+            rc->channel_record_->GetChannelTensorsMetaData();
+        const auto& meta = tensors_meta_data.find(parsed_key);
+        if (meta == tensors_meta_data.end()) {
+          LOG(FATAL)
+              << "meta is not found in rc->channel_record_->tensors_meta_data_";
         }
-
-        if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
-          // receive a tensor RDMA write
-          uint32_t request_index = imm_data;
-          RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
-          request->RecvTensorContent();
-          continue;
+        bool can_memcpy = DataTypeCanUseMemcpy(meta->second.data_type_);
+        if (!has_data) {
+          // parse the DriverPrefixMessage
+          DriverPrefixMessage driver_prefix =
+              DriverPrefixMessage::ParseDriverPrefixMessage(
+                  (void*)entry_ptr->addr_, meta->second.meta_changed_);
+          Tensor* val;
+          void* entry_tensor_addr = nullptr;
+          // get the RDMA offset addr of the Tensor
+          if (meta->second.meta_changed_) {
+            entry_tensor_addr = (void*)(entry_ptr->addr_ +
+                DriverPrefixMessage::kPrefixMessageTotalBytes);
+          } else {
+            entry_tensor_addr = (void*)(entry_ptr->addr_ +
+                DriverPrefixMessage::CkPrefixMessageTotalBytes);
+          }
+          if (can_memcpy) {
+            // tensor can use zero-copy
+            auto fake_allocator = new FakeAllocator(entry_tensor_addr);
+            if (meta->second.meta_changed_) {
+              val = new Tensor(fake_allocator,
+                               meta->second.data_type_,
+                               driver_prefix.tensor_shape_);
+            } else {
+              val = new Tensor(fake_allocator,
+                               meta->second.data_type_,
+                               meta->second.tensor_shape_);
+            }
+            // memcpy(DMAHelper::base(val), entry_tensor_addr, val->TotalBytes());
+          } else {
+            // proto should not use zero-copy
+            if (meta->second.meta_changed_) {
+              val = new Tensor(meta->second.data_type_,
+                               driver_prefix.tensor_shape_);
+            } else {
+              val = new Tensor(meta->second.data_type_,
+                               meta->second.tensor_shape_);
+            }
+            TensorProto proto;
+            CHECK(ParseProtoUnlimited(&proto, entry_tensor_addr,
+                                      driver_prefix.tensor_bytes_))
+                << " fail to parse proto from array";
+            if (proto.dtype() > 0 && proto.dtype() <= DataType_MAX) {
+              Tensor parsed(proto.dtype());
+              if (parsed.FromProto(cpu_allocator(), proto)) {
+                *val = std::move(parsed);
+              }
+            }
+          }
+          Status s = Status::OK();
+          bool is_dead = driver_prefix.is_dead_;
+          int64 recv_micros = 0;
+          Rendezvous::Args send_args = Rendezvous::Args();
+          rc->local_driver_buffer_mgr_->QueueRdmaSave(parsed_key,
+              send_args, val, is_dead, recv_micros);
+          // if (val != nullptr) {
+          //   delete val;
+          //   val = nullptr;
+          // }
+        } else {
+          // When recv a SendDriverData which means that :
+          // local recv SendDriver is ready.
+          LOG(FATAL) << "Local recv SendDriver Data is not ready"
+                     << " has_data:" << has_data;
        }
+      }
+      return;
+    }
 
-        // receive a control message
-        rb = rc->rx_message_buffer_;
-        RdmaMessage::ParseMessage(rm, rb->buffer_);
-        RdmaMessageBuffer::SendAck(rc);
-        RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
-                    << ": Received " << MessageTypeToString(rm.type_) << " "
-                    << "#" << rm.request_index_ << ": " << rm.name_;
-
-        if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
-          RdmaTensorResponse* response = rc->AddTensorResponse(rm);
-          response->Start();
-        } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
-          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
-          request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
+    // receive a control message
+    int pair_index = imm_data - RDMA_IMM_DATA_ACK - 1;
+    int buffer_index = 2 * pair_index + 1;
+    rb = rc->message_buffers()[buffer_index];
+    RdmaMessage::ParseMessage(rm, rb->buffer_);
+    RdmaMessageBuffer::SendAck(rc, pair_index + 1);
+    RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+                << ": Received " << MessageTypeToString(rm.type_) << " "
+                << "#" << rm.request_index_ << ": " << rm.name_;
+    RDMA_LOG(1) << "pair_index imm_data:" << imm_data
+                << " Process_WR rm type:" << MessageTypeToString(rm.type_);
+
+    if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
+      RdmaTensorResponse* response = rc->AddTensorResponse(rm);
+      RDMA_LOG(1) << "GetResponse....";
+      response->Start();
+    } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
+      RDMA_LOG(1) << "Receive RDMA_MESSAGE_META_DATA_UPDATE";
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      if (request == nullptr) {
+        LOG(FATAL) << "RDMA_MESSAGE_META_DATA_UPDATE request : "
+                   << rm.request_index_ << " is already done by LocalBufferMgr.";
+      }
+      request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
                                   rm.is_dead_, rm.tensor_bytes_);
 #ifdef RDMA_DATA_VALIDATION
-          request->RecvTensorChecksum(rm.checksum_);
+      request->RecvTensorChecksum(rm.checksum_);
 #endif
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
-          RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
-          response->Resume();
-        } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
-          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
-          request->RecvErrorStatus(rm.status_);
-        }
-      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
-        RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_[i].wr_id);
-        RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
-        switch (wr_id->write_type) {
-          case RDMA_WRITE_ID_ACK:
-            break;
-          case RDMA_WRITE_ID_MESSAGE: {
-            RdmaMessageBuffer* rb =
-                reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
-            rb->SetBufferStatus(local, idle);
-            rb->SendNextItem();
-            break;
-          }
-          case RDMA_WRITE_ID_TENSOR_WRITE: {
-            RdmaTensorResponse* response =
-                reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
-            response->Destroy();
-          }
-        }
-        delete wr_id;
+    } else if (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) {
+      LOG(INFO) << "Receive RDMA_MESSAGE_DRIVER_BEGIN";
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      if (request == nullptr) {
+        LOG(INFO) << "RDMA_MESSAGE_DRIVER_BEGIN request : "
+                  << rm.request_index_ << " is already done by LocalBufferMgr.";
+      }
+    } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
+      RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
+      response->Resume();
+    } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+      RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+      request->RecvErrorStatus(rm.status_);
+    }
+  } else if (wc_.opcode == IBV_WC_RDMA_WRITE) {
+    RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_.wr_id);
+    RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
+    switch (wr_id->write_type) {
+      case RDMA_WRITE_ID_ACK:
+        break;
+      case RDMA_WRITE_ID_MESSAGE: {
+        RdmaMessageBuffer* rb =
+            reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
+        // TODO(wuyongyu02): (local buffer idle)
+        rb->SetBufferStatus(local, idle);
+        rb->SendNextItem();
+        break;
+      }
+      case RDMA_WRITE_ID_SEND_DEIVER_WRITE: {
+        DriverEntry* entry =
+            reinterpret_cast<DriverEntry*>(wr_id->write_context);
+        RDMA_LOG(1) << "send succeeded, FreeEntry uid:" << entry->uinque_id_;
+        break;
+      }
+      case RDMA_WRITE_ID_TENSOR_WRITE: {
+        RdmaTensorResponse* response =
+            reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
+        response->Destroy();
+      }
+    }
+    if (wr_id->write_type != RDMA_WRITE_ID_SEND_DEIVER_WRITE) {
+      delete wr_id;
+    }
+  }
+}
+
+void RdmaAdapter::Pool_Process_CQ(int cq_num) {
+  LOG(INFO) << "Pool_Process_CQ:" << cq_num;
+  auto cq = cq_vec_[cq_num];
+  auto event_channel = event_channel_vec_[cq_num];
+  auto wc = wc_vec_[cq_num];
+  while (true) {
+    ibv_cq* cq_tmp;
+    void* cq_context;
+    CHECK(!ibv_get_cq_event(event_channel, &cq_tmp, &cq_context));
+    CHECK(cq_tmp == cq);
+    ibv_ack_cq_events(cq_tmp, 1);
+    CHECK(!ibv_req_notify_cq(cq, 0));
+
+    int ne =
+        ibv_poll_cq(cq, MAX_CONCURRENT_WRITES * 2, static_cast<ibv_wc*>(wc));
+    CHECK_GE(ne, 0);
+
+    for (int i = 0; i < ne; ++i) {
+      auto c = std::bind(&RdmaAdapter::Process_WR, this, wc[i], cq_num);
+      pool_->Schedule(std::move(c));
+      // worker_env_->compute_pool->Schedule(std::move(c));
     }
   }
 }
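Aside (condensed sketch, not in the patch): Process_WR demultiplexes wc_.imm_data into four ranges. The constants are the ones the code above relies on (defined elsewhere in rdma.h, not shown here), so they are passed in as parameters:

#include <cstdint>

enum ImmClass { kTensorRequest, kSendMgrTensor, kAck, kControlMessage };

ImmClass ClassifyImm(uint32_t imm_data,
                     uint32_t max_request_id,    // RDMA_IMM_MAX_REQUEST_ID
                     uint32_t min_sendmgr_base,  // RDMA_IMM_MIN_SENDMGR_BASE
                     uint32_t data_ack) {        // RDMA_IMM_DATA_ACK
  if (imm_data <= max_request_id)  // tensor content written by the peer
    return imm_data < min_sendmgr_base ? kTensorRequest : kSendMgrTensor;
  if (imm_data <= data_ack)        // ack: pair_index = imm - max_request_id - 1
    return kAck;
  return kControlMessage;          // control: pair_index = imm - data_ack - 1
}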
IBV_QPT_UC; qp_ = ibv_create_qp(adapter_->pd_, &attr); CHECK(qp_) << "Failed to create queue pair"; @@ -589,6 +851,7 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name, attr.port_num = adapter_->params_.port_num; attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE; + // https://man7.org/linux/man-pages/man3/ibv_modify_qp.3.html int mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; CHECK(!ibv_modify_qp(qp_, &attr, mask)) << "Failed to set QP to INIT"; @@ -614,24 +877,50 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name, // create message and ack buffers, then initialize the tables. { const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer"}; - tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]); - rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]); message_buffers_.reserve(kNumMessageBuffers); - message_buffers_.push_back(tx_message_buffer_); - message_buffers_.push_back(rx_message_buffer_); - // create buffer on host - tx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); - rx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + + // add other buffers + for (int i = 0; i < kNumMessageBuffers; i = i + 2) { + int pair_index = i/2+1; + std::stringstream ss; + ss << pair_index; + auto* tx_buffer1 = new RdmaMessageBuffer(this, + "tx_message_buffer_" + ss.str()); + tx_buffer1->pair_index_ = pair_index; + auto* rx_buffer2 = new RdmaMessageBuffer(this, + "rx_message_buffer_" + ss.str()); + rx_buffer2->pair_index_ = pair_index; + message_buffers_.push_back(tx_buffer1); + message_buffers_.push_back(rx_buffer2); + // create buffer and bind to MR + // tx_buffer1->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + // rx_buffer2->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize); + // NOTE(wuyongyu02): use chunk to alloc MR + void* p1; + void* p2; + ibv_mr* mr1; + ibv_mr* mr2; + adapter_->recv_chunk_->Alloc(ib_allocate_size(RdmaMessage::kRdmaMessageBufferSize * 2), &p1, &mr1); + CHECK(p1 != nullptr) << " p1 is nullptr"; + tx_buffer1->ChunkCreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize, p1, mr1); + adapter_->recv_chunk_->Alloc(ib_allocate_size(RdmaMessage::kRdmaMessageBufferSize * 2), &p2, &mr2); + CHECK(p1 != nullptr) << " p2 is nullptr"; + rx_buffer2->ChunkCreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize, p2, mr2); + } } CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_ << " with error " << std::strerror(errno); + + channel_record_ = std::make_shared(this); + rdma_send_driver_mgr_ = std::make_shared(this); + local_driver_buffer_mgr_ = std::make_shared(this); } RdmaChannel::~RdmaChannel() { ibv_dereg_mr(mr_); CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP"; - delete tx_message_buffer_; - delete rx_message_buffer_; + // delete tx_message_buffer_; + // delete rx_message_buffer_; } void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) { @@ -657,7 +946,7 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) { void RdmaChannel::Recv() { struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t) this; + wr.wr_id = (uint64_t)this; struct ibv_recv_wr* bad_wr; CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv"; } @@ -668,9 +957,12 @@ RdmaTensorRequest* RdmaChannel::InsertTensorRequest( const RdmaTensorRequest::RecvDoneCallback& done) { mutex_lock lock{ct_mu_}; uint32_t request_index = request_serial_++; - if 
(request_serial_ > RDMA_IMM_MAX_REQUEST_ID) { + + // > RDMA_IMM_MIN_SENDMGR_BASE for SendMgr + if (request_serial_ >= RDMA_IMM_MIN_SENDMGR_BASE) { request_serial_ = 0; } + RdmaTensorRequest request(request_index, key, step_id, this, dst_dev, recv_args, done); auto it = request_table_.emplace(request_index, request); @@ -679,16 +971,34 @@ RdmaTensorRequest* RdmaChannel::InsertTensorRequest( void RdmaChannel::RemoveTensorRequest(uint32_t request_index) { mutex_lock lock{ct_mu_}; - request_table_.erase(request_index); + RDMA_LOG(1) << "RemoveTensorRequest:" << request_index; + //<< " parsed_key:" << key_; + const auto& it = request_table_.find(request_index); + if (it != request_table_.end()) { + request_table_.erase(request_index); + } } RdmaTensorRequest* RdmaChannel::GetTensorRequest(uint32_t request_index) { mutex_lock lock{ct_mu_}; RequestTable::iterator iter = request_table_.find(request_index); - CHECK(iter != request_table_.end()); + // CHECK(iter != request_table_.end()) + // << " RdmaChannel is already been delete."; + if (iter == request_table_.end()) { + return nullptr; + } return &iter->second; } +RdmaTensorRequest* RdmaChannel::GetTensorRequestForCat(uint32_t request_index) { + mutex_lock lock{ct_mu_}; + RequestTable::iterator iter = request_table_.find(request_index); + if (iter != request_table_.end()) { + return &iter->second; + } + return nullptr; +} + void RdmaChannel::Connect() { { mutex_lock lock{mu_}; @@ -728,11 +1038,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class; int r; - CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER))) + CHECK(!(r = ibv_modify_qp(qp_, &attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER))) << "QP to Ready to Receive " << r; memset(&attr, 0, sizeof(ibv_qp_attr)); @@ -743,10 +1053,10 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { attr.rnr_retry = 7; /* infinite */ attr.max_rd_atomic = 1; - CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC))) + CHECK(!(r = ibv_modify_qp(qp_, &attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC))) << "QP to Ready to Send " << r; connected_ = true; @@ -755,6 +1065,711 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) { } } +RdmaSendDriverMgr::RdmaSendDriverMgr(RdmaChannel* channel) { + channel_ = channel; + driver_mgr_is_ok_ = false; +} + +size_t RdmaSendDriverMgr::InitLocalDriverEntry() { + // LOG(INFO) << "InitLocalDriverEntry begin..."; + const auto& tensors_meta_data = + channel_->channel_record_->GetChannelTensorsMetaData(); + const auto& global_tensors_meta_data = + RecordTensorMetaData::Singleton().GetGlobalTensorsMetaData(); + // LOG(INFO) << "To Remote name:" << channel_->remote_name_ + // << "Channel_Record_Size:" << tensors_meta_data.size(); + const auto& tensors_uidkeys = + channel_->channel_record_->GetChannelTensorsUidParsedkey(); + + CHECK(tensors_meta_data.size() == tensors_uidkeys.size()) + << "tensors_meta_data size:" << tensors_meta_data.size() + << " tensors_uidkeys size:" << tensors_uidkeys.size(); + + LOG(INFO) << "InitLocalDriverEntry channel Metadata key begin " + << "create dirven-entry:" + << 
tensors_meta_data.size();
+  std::set<string> regrex_edge_keys;
+  for (auto& it : tensors_meta_data) {
+    const auto& meta_data = it.second;
+    const uint32& uid = meta_data.uid_;
+    void* addr;
+    ibv_mr* mr;
+    // allocate memory and region
+    int find_allocate_bytes = 0;
+    // NOTE(wuyongyu02) alloc recv-tensor memory
+    if (!channel_->FindLocalMr(it.first, &addr, &mr, &find_allocate_bytes)) {
+      LOG(INFO) << it.first << " not found..";
+      find_allocate_bytes = 0;
+    } else {
+      LOG(INFO) << it.first << " found.. bytes:" << find_allocate_bytes;
+    }
+    int need_bytes = VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() *
+        ChannelRecordTensorMetaData::GetTensorBytes(meta_data) +
+        DriverPrefixMessage::kPrefixMessageTotalBytes;
+
+    if (find_allocate_bytes < need_bytes) {
+      LOG(INFO) << it.first << " reallocate.. need:"
+                << need_bytes << " found:" << find_allocate_bytes;
+      channel_->channel_record_->AllocateMemoryAndRegion(it.first, meta_data,
+          channel_->adapter_->pd_, &addr, &mr, &find_allocate_bytes);
+    }
+    driver_entries_[it.first] = std::make_shared<DriverEntry>(
+        uid, it.first, addr, mr, find_allocate_bytes);
+    driver_entries_[it.first]->meta_changed_ = meta_data.meta_changed_;
+  }
+
+  LOG(INFO) << "InitLocalDriverEntry channel Metadata key:"
+            << tensors_meta_data.size()
+            << " driver_entries size:"
+            << driver_entries_.size()
+            << " global_tensors_meta_data size:"
+            << global_tensors_meta_data.size();
+  // Notify the remote side of the local driver entries through RPC.
+  NotifyRemoteDriverEntry();
+  return driver_entries_.size();
+}
+
+// server service Update
+void RdmaSendDriverMgr::RpcUpdateDriverEntries(const DriverMessageResp& resp) {
+  CHECK(channel_->remote_name_ == resp.host_name())
+      << "channel_->remote_name_:" << channel_->remote_name_
+      << " resp.host_name:" << resp.host_name();
+  size_t driver_mgr_is_ok = 0;
+  for (const auto& it : resp.item()) {
+    const auto& parsed_key = it.parsed_key();
+    const auto& entry = driver_entries_.find(parsed_key);
+    if (entry == driver_entries_.end()) {
+      LOG(FATAL) << "RDMA parsed key "
+                 << parsed_key
+                 << " is not found in driver_entries_";
+      for (auto& k : driver_entries_) {
+        LOG(INFO) << "driver_entries_ key:" << k.first;
+      }
+    }
+    auto& entry_ptr = driver_entries_[parsed_key];
+    if (it.status() == DriverMessageItem::RPC_0 &&
+        entry_ptr->dri_status_ == RPC_0) {
+      entry_ptr->dri_status_ = RPC_1;
+    } else if (it.status() == DriverMessageItem::RPC_1 &&
+               entry_ptr->dri_status_ == RPC_1) {
+      entry_ptr->dri_status_ = DATA_NOT_READY;
+      driver_mgr_is_ok++;
+    } else {
+      LOG(ERROR) << "RDMA RdmaSendDriverMgr::DriverEntries"
+                 << " local_name:" << channel_->local_name_
+                 << " remote_name:" << channel_->remote_name_
+                 << " key:" << parsed_key
+                 << " entry.dri_status_:" << entry_ptr->dri_status_
+                 << " it.status:" << it.status();
+    }
+  }
+  // When all entries are ok, set the driver_mgr status to 'ok'.
+  if (driver_mgr_is_ok == driver_entries_.size()) {
+    driver_mgr_is_ok_.store(true);
+    // LOG(INFO) << "[Succeed] "
+    //           << channel_->remote_name_
+    //           << " driver_mgr_ptr RpcSend Entries is ok!";
+  }
+}
+
+bool RdmaSendDriverMgr::RpcReqResp(GrpcVerbsClient* client,
+                                   const DriverMessageReq& req) {
+  // synchronous call
+  const auto& remote_name = channel_->remote_name_;
+  DriverMessageResp resp;
+  Status s;
+  int attempts = 0;
+  static const int max_num_attempts = 5;
+  do {
+    s = client->ReqDriverMessage(&req, &resp);
+    // save obtained remote addresses
+    // connect to the remote channel
+    if (s.ok()) {
+      RpcUpdateDriverEntries(resp);
+    } else {
+      LOG(ERROR) << "ReqDriverMessage Connecting to " << remote_name
+                 << ": Got " << s.error_message() << ". Retrying ("
+                 << (attempts + 1) << "/" << max_num_attempts << ")...";
+      if (++attempts == max_num_attempts) {
+        return false;
+      }
+      channel_->adapter_->worker_env_->env->SleepForMicroseconds(2000000);
+    }
+  } while (!s.ok());
+  return true;
+}
+
+// Notify by Rpc
+void RdmaSendDriverMgr::NotifyRemoteDriverEntry() {
+  const auto& remote_name = channel_->remote_name_;
+  const auto& local_name = channel_->local_name_;
+  RDMA_LOG(1) << "NotifyRemoteDriverEntry local_worker_name:" << local_name
+              << " remote_name:" << remote_name
+              << " driver_entries_ size:" << driver_entries_.size();
+
+  // get the channel cache
+  SharedGrpcChannelPtr client_channel =
+      channel_->channel_cache_->FindWorkerChannel(remote_name);
+  CHECK(client_channel != nullptr) << "target:"
+                                   << remote_name
+                                   << " client_channel is null!";
+  GrpcVerbsClient* client = new GrpcVerbsClient(client_channel);
+  CHECK(client != nullptr) << "No worker known as " << remote_name;
+
+  DriverMessageReq req;
+  req.set_host_name(local_name);
+  for (auto& it : driver_entries_) {
+    auto* item = req.add_item();
+    auto driver_entry_ptr = it.second;
+    item->set_unique_id(driver_entry_ptr->uinque_id_);
+    item->set_parsed_key(it.first);
+    item->set_remote_addr(driver_entry_ptr->addr_);
+    item->set_rkey(driver_entry_ptr->lkey_);
+    item->set_allocate_bytes(driver_entry_ptr->allocate_size_);
+    item->set_meta_changed(driver_entry_ptr->meta_changed_);
+    item->set_status(DriverMessageItem::RPC_0);
+    // Remember to update the driver_entries_ status.
+    it.second->dri_status_ = RPC_0;
+  }
+  if (RpcReqResp(client, req)) {
+    DriverMessageReq req_rpc2;
+    req_rpc2.set_host_name(local_name);
+    for (auto& it : driver_entries_) {
+      auto* item = req_rpc2.add_item();
+      auto driver_entry_ptr = it.second;
+      item->set_unique_id(driver_entry_ptr->uinque_id_);
+      item->set_parsed_key(it.first);
+      item->set_remote_addr(driver_entry_ptr->addr_);
+      item->set_rkey(driver_entry_ptr->lkey_);
+      item->set_allocate_bytes(driver_entry_ptr->allocate_size_);
+      item->set_meta_changed(driver_entry_ptr->meta_changed_);
+      item->set_status(DriverMessageItem::RPC_1);
+      // Remember to update the driver_entries_ status.
+      it.second->dri_status_ = RPC_1;
+    }
+    if (!RpcReqResp(client, req_rpc2)) {
+      LOG(ERROR) << "ReqDriverMessage RpcReqResp2 remote node "
+                 << remote_name << " FAILED";
+    }
+  } else {
+    LOG(ERROR) << "ReqDriverMessage RpcReqResp remote node "
+               << remote_name << " FAILED";
+  }
+  RDMA_LOG(0) << "ReqDriverMessage Connected to remote node " << remote_name;
+  delete client;
+}
+
+void RdmaSendDriverMgr::RpcUpdateRemoteDriverEntry(
+    const DriverMessageReq* request, DriverMessageResp* response) {
+  // setting up response
+  response->set_host_name(channel_->local_name_);
+  size_t recv_driver_mgr_entry_ok_nums = 0;
+  for (const auto& req_item : request->item()) {
+    DriverMessageItem* resp_item = response->add_item();
+    string parsed_key = req_item.parsed_key();
+    resp_item->set_parsed_key(parsed_key);
+    const auto& it = recv_entries_.find(parsed_key);
+    DriverMessageItem::DriverStatus status = req_item.status();
+    if (it == recv_entries_.end() && status == DriverMessageItem::RPC_0) {
+      recv_entries_[parsed_key] = std::make_shared<DriverEntry>();
+      recv_entries_[parsed_key]->uinque_id_ = req_item.unique_id();
+      recv_entries_[parsed_key]->addr_ = req_item.remote_addr();
+      recv_entries_[parsed_key]->lkey_ = req_item.rkey();
+      recv_entries_[parsed_key]->allocate_size_ = req_item.allocate_bytes();
+      recv_entries_[parsed_key]->meta_changed_ = req_item.meta_changed();
+      recv_entries_[parsed_key]->parsed_key_ = parsed_key;
+      // update recv entries
+      recv_entries_[parsed_key]->dri_status_ = RPC_0;
+      // response status
+      resp_item->set_status(DriverMessageItem::RPC_0);
+      RDMA_LOG(1) << "RpcUpdateRemoteDriverEntry parsed_key:"
+                  << parsed_key
+                  << " recv dri_status: RPC_0,"
+                  << " updated dri_status: "
+                  << recv_entries_[parsed_key]->dri_status_;
+    } else if (it->second->dri_status_ == RPC_0 &&
+               status == DriverMessageItem::RPC_1) {
+      // response status
+      resp_item->set_status(DriverMessageItem::RPC_1);
+      // update recv entries
+      recv_entries_[parsed_key]->dri_status_ = DATA_NOT_READY;
+      recv_driver_mgr_entry_ok_nums += 1;
+      RDMA_LOG(1) << "RpcUpdateRemoteDriverEntry parsed_key:"
+                  << parsed_key
+                  << " recv dri_status: RPC_1,"
+                  << " updated dri_status: DATA_NOT_READY ("
+                  << recv_entries_[parsed_key]->dri_status_ << ")";
+    } else {
+      LOG(ERROR) << "UpdateRemoteDriverEntry:"
+                 << " local_name:"
+                 << channel_->local_name_
+                 << " recv from remote:"
+                 << request->host_name()
+                 << " parsed_key:"
+                 << parsed_key
+                 << " recv_entries dri_status is not `RPC_1`,"
+                 << " status is:"
+                 << status
+                 << " dri_status is "
+                 << recv_entries_[parsed_key]->dri_status_;
+    }
+  }
+  RDMA_LOG(1) << "RdmaSendDriverMgr::RpcUpdateRemoteDriverEntry end...."
+              << " localname:" << channel_->local_name_
+              << " remotename:" << channel_->remote_name_
+              << " recv_entries_ size:" << recv_entries_.size();
+  // driver_mgr_ptr is ok and can send tensors to the other client.
+  if (recv_driver_mgr_entry_ok_nums == recv_entries_.size()) {
+    // allocate string RDMA
+    // NOTE(wuyongyu02)
+    // Allocate StringMessage changed to FindOrCreateMemoryRegion
+    // AllocateRecvEntriesStringMemoryAndRegion();
+    LOG(INFO) << "[Succeed] "
+              << request->host_name()
+              << " driver_mgr_ptr RecvEntries is ok!";
+  }
+}
+
+void RdmaSendDriverMgr::AllocateRecvEntriesStringMemoryAndRegion() {
+  for (auto& k : recv_entries_) {
+    void* addr;
+    ibv_mr* mr;
+    // allocate memory and region
+    int allocate_bytes = 0;
+    channel_->channel_record_->AllocateSendStringMemoryAndRegion(
+        channel_->adapter_->pd_, &addr, &mr, &allocate_bytes);
+    k.second->send_mem_mr_ = std::make_shared(
+        addr, mr, allocate_bytes);
+    RDMA_LOG(1) << "AllocateRecvEntriesStringMemoryAndRegion:"
+                << k.first
+                << " allocate_bytes:"
+                << allocate_bytes;
+  }
+}
+
+std::shared_ptr<DriverEntry> RdmaSendDriverMgr::GetRecvEntry(
+    const std::string& parsed_key, bool* has_data) {
+  const auto& it = recv_entries_.find(parsed_key);
+  if (it == recv_entries_.end()) {
+    for (auto& find : recv_entries_) {
+      if (absl::StrContains(parsed_key, find.first)) {
+        return find.second;
+      }
+    }
+    // LOG(FATAL) << parsed_key << " is not found in recv_entries_.";
+    return nullptr;
+  }
+  *has_data = recv_entries_[parsed_key]->dri_status_ == DATA_READY;
+  // LOG(INFO) << "parsed_key:" << parsed_key
+  //           << " status:" << recv_entries_[parsed_key]->dri_status_
+  //           << " has_data:" << *has_data;
+  return recv_entries_[parsed_key];
+}
+
+std::shared_ptr<DriverEntry> RdmaSendDriverMgr::GetDriverEntry(
+    const std::string& parsed_key, bool* has_data) {
+  const auto& it = driver_entries_.find(parsed_key);
+  if (it == driver_entries_.end()) {
+    for (auto& find : driver_entries_) {
+      if (absl::StrContains(parsed_key, find.first)) {
+        return find.second;
+      }
+    }
+    LOG(FATAL) << parsed_key << " is not found in driver_entries_.";
+  }
+  *has_data = driver_entries_[parsed_key]->dri_status_ == DATA_READY;
+  return driver_entries_[parsed_key];
+}
+
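// The two RPC rounds above implement a per-tensor handshake: both peers walk
// RPC_0 -> RPC_1 -> DATA_NOT_READY for every entry before any send-driven
// RDMA write happens. A minimal sketch of the transition rule, assuming the
// DriverStatus values used in this file (the Advance() helper itself is
// illustrative and not part of the patch):
//
//   DriverStatus Advance(DriverStatus local, DriverStatus peer_msg) {
//     if (peer_msg == RPC_0 && local == RPC_0) return RPC_1;
//     if (peer_msg == RPC_1 && local == RPC_1) return DATA_NOT_READY;
//     return local;  // unexpected ordering: keep state and log, as above
//   }
//
// Only once every entry reaches DATA_NOT_READY does driver_mgr_is_ok_ flip
// to true, marking the channel ready for send-driven transfers.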
+DriverEntry::DriverEntry() {
+  dri_status_.store(DRIVER_INIT);
+}
+
+DriverEntry::DriverEntry(const uint32& uid,
+                         const std::string& parsedkey,
+                         void* addr,
+                         ibv_mr* mr,
+                         int allocate_size) {
+  addr_ = (uint64_t) addr;
+  mem_mr_ = std::make_shared(addr, mr, allocate_size);
+  lkey_ = mr->lkey;
+  uinque_id_ = uid;
+  parsed_key_ = parsedkey;
+  dri_status_.store(DRIVER_INIT);
+  allocate_size_ = allocate_size;
+}
+
+string ChannelRecordTensorMetaData::RegexEdgeName(const string& str) {
+  std::string regex_str(".*edge_\\d*(_.*)(_\\d*)?;0:0");
+  std::regex pattern(regex_str, std::regex::icase);
+  std::smatch result;
+  if (std::regex_match(str, result, pattern)) {
+    return std::string(result[1]);
+  } else {
+    LOG(ERROR) << "RegexEdgeName key:" << str << " is not matched. pattern:"
+               << regex_str;
+  }
+  return str;
+}
+
+void ChannelRecordTensorMetaData::InitMetaDataFromEnv() {
+  // Init Channel
+  mutex_lock l(channel_tensor_meta_data_mu_);
+  const string& name = channel_->local_name_;
+  if (absl::StrContains(name, "worker") ||
+      absl::StrContains(name, "localhost")) {
+    const string meta_str = GetWorkerMetas();
+    StringPiece s(meta_str);
+    while (!s.empty()) {
+      StringPiece result = ConsumeNextPart(&s, '|');
+      if (!result.empty()) {
+        StringPiece meta_name_view = ConsumeNextPart(&result, '#');
+        if (!meta_name_view.empty()) {
+          auto meta_name = string(meta_name_view);
+          std::stringstream ss(string(result));
+          int meta_size = 0;
+          ss >> meta_size;
+          CHECK(meta_size > 0)
+              << " meta_name:" << meta_name << " size:" << meta_size;
+          auto find = channel_tensors_meta_data_.find(meta_name);
+          if (find == channel_tensors_meta_data_.end()) {
+            auto it = channel_tensors_meta_data_.emplace(meta_name,
+                                                         TensorMetaData());
+            channel_tensors_uid_parsed_key_.emplace(uid_, meta_name);
+            auto& meta = channel_tensors_meta_data_[meta_name];
+            meta.uid_ = uid_;
+            if (it.second) {
+              uid_++;
+            }
+            meta.data_type_ = DT_INT64;
+            meta.tensor_shape_ = {};
+            meta.proto_size_ = 0;
+            meta.is_dead_ = false;
+          }
+        }
+      }
+    }
+  }
+}
+
+ChannelRecordTensorMetaData::ChannelRecordTensorMetaData(
+    RdmaChannel* channel) {
+  channel_ = channel;
+  InitMetaDataFromEnv();
+}
+
+uint32 ChannelRecordTensorMetaData::GetEnumSize(const DataType& data_type) {
+  switch (data_type) {
+    case DT_FLOAT:
+      return 4;
+    case DT_DOUBLE:
+      return 8;
+    case DT_INT32:
+      return 4;
+    case DT_UINT32:
+      return 4;
+    case DT_UINT16:
+      return 2;
+    case DT_INT8:
+      return 1;
+    case DT_UINT8:
+      return 1;
+    case DT_INT16:
+      return 2;
+    case DT_INT64:
+      return 8;
+    case DT_UINT64:
+      return 8;
+    case DT_BOOL:
+      return 1;
+    default:
+      return 4;
+  }
+}
+
+void ChannelRecordTensorMetaData::AllocateSendStringMemoryAndRegion(
+    ibv_pd* pd,
+    void** addr,
+    ibv_mr** mr,
+    int* addr_size,
+    Allocator* alloc_attr) {
+  // allocate prefix DriverPrefixMessage
+  auto total_bytes = DriverPrefixMessage::kPrefixMessageTotalBytes;
+  RDMA_LOG(1) << "AllocateSendStringMemoryAndRegion total bytes:"
+              << total_bytes;
+  *addr = malloc(total_bytes);
+  CHECK(*addr != nullptr)
+      << "AllocateSendStringMemoryAndRegion addr malloc failed!";
+  *mr = ibv_reg_mr(pd, *addr, total_bytes,
+                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  *addr_size = total_bytes;
+}
+
+int ChannelRecordTensorMetaData::GetTensorBytes(const TensorMetaData& m) {
+  int total_bytes = 0;
+  if (DataTypeCanUseMemcpy(m.data_type_)) {
+    int m1 = m.tensor_shape_.num_elements();
+    total_bytes = m1 * GetEnumSize(m.data_type_);
+  } else {
+    total_bytes = m.proto_size_;
+  }
+  return total_bytes;
+}
+
+void ChannelRecordTensorMetaData::AllocateMemoryAndRegion(
+    const string& key,
+    const TensorMetaData& m,
+    ibv_pd* pd,
+    void** addr,
+    ibv_mr** mr,
+    int* addr_size,
+    Allocator* alloc_attr) const {
+  int total_bytes = GetTensorBytes(m);
+  total_bytes =
+      VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * total_bytes;
+  // allocate prefix DriverPrefixMessage
+  total_bytes += DriverPrefixMessage::kPrefixMessageTotalBytes;
+  RDMA_LOG(1) << "AllocateMemoryAndRegion key:"
+              << key
+              << " total bytes:" << total_bytes;
+  *addr = malloc(total_bytes);
+  CHECK(*addr != nullptr) << "AllocateMemoryAndRegion addr malloc failed!";
+  *mr = ibv_reg_mr(pd, *addr, total_bytes,
+                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  *addr_size = total_bytes;
+}
+
+void ChannelRecordTensorMetaData::Record(const std::string& tensor_name,
+                                         const TensorMetaData& m) {
+  // once send-driving starts, stop recording
+  if (channel_->could_send_driver_) {
+    return;
+  }
+  // LOG(INFO) << "ChannelRecordTensorMetaData::Record " << is_stable_;
+  mutex_lock l(channel_tensor_meta_data_mu_);
+  auto find = channel_tensors_meta_data_.find(tensor_name);
+  if (find == channel_tensors_meta_data_.end()) {
+    // LOG(INFO) << "Channel Record Tensorname:" << tensor_name;
+    auto it =
+        channel_tensors_meta_data_.emplace(tensor_name, TensorMetaData());
+    channel_tensors_uid_parsed_key_.emplace(uid_, tensor_name);
+    auto& meta = channel_tensors_meta_data_[tensor_name];
+    meta.uid_ = uid_;
+    if (it.second) {
+      uid_++;
+    }
+    meta.data_type_ = m.data_type_;
+    meta.tensor_shape_ = m.tensor_shape_;
+    meta.proto_size_ = m.proto_size_;
+    meta.is_dead_ = m.is_dead_;
+  } else {
+    auto& meta = find->second;
+    bool can_memcpy = DataTypeCanUseMemcpy(m.data_type_);
+    if (can_memcpy) {
+      int m1 = 1;
+      int m2 = 1;
+      for (int d = 0; d < m.tensor_shape_.dims(); d++) {
+        m1 *= m.tensor_shape_.dim_size(d);
+        m2 *= meta.tensor_shape_.dim_size(d);
+      }
+      if (m1 > m2) {
+        meta.data_type_ = m.data_type_;
+        meta.tensor_shape_ = m.tensor_shape_;
+        meta.proto_size_ = m.proto_size_;
+        meta.is_dead_ = m.is_dead_;
+      }
+      if (m1 != m2) {
+        // LOG(INFO) << "Tensorname:" << tensor_name << " meta_changed.";
+        meta.meta_changed_ = true;
+      }
+    }
+    if (!can_memcpy && meta.proto_size_ > m.proto_size_) {
+      meta.data_type_ = m.data_type_;
+      meta.tensor_shape_ = m.tensor_shape_;
+      meta.proto_size_ = 10 * m.proto_size_;
+      meta.is_dead_ = m.is_dead_;
+    }
+    if (!can_memcpy && meta.proto_size_ != m.proto_size_) {
+      // LOG(INFO) << "Tensorname:" << tensor_name << " meta_changed.";
+      meta.meta_changed_ = true;
+    }
+  }
+}
+
+StringPiece ChannelRecordTensorMetaData::ConsumeNextPart(StringPiece* s,
+                                                         char delim) {
+  for (size_t offset = 0; offset < s->size(); offset++) {
+    if ((*s)[offset] == delim) {
+      StringPiece result(s->data(), offset);
+      s->remove_prefix(offset + 1);  // +1: remove the delimiter as well
+      return result;
+    }
+  }
+  // No delimiter found: return the rest of the string
+  StringPiece result(s->data(), s->size());
+  s->remove_prefix(s->size());
+  return result;
+}
+
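// InitMetaDataFromEnv() above expects GetWorkerMetas() to return a
// '|'-separated list of "tensor_name#byte_size" pairs. A hedged usage sketch
// of ConsumeNextPart() on that format (the sample string is illustrative):
//
//   StringPiece s("edge_17_loss;0:0#128|edge_18_grad;0:0#4096");
//   while (!s.empty()) {
//     StringPiece pair =
//         ChannelRecordTensorMetaData::ConsumeNextPart(&s, '|');
//     StringPiece name =
//         ChannelRecordTensorMetaData::ConsumeNextPart(&pair, '#');
//     // 'name' is the tensor key; the remainder of 'pair' is its byte size.
//   }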
+
+string RecordTensorMetaData::DebugString() const {
+  std::vector<string> lc;
+  for (auto& it : global_tensors_meta_data_) {
+    std::vector<string> ds;
+    ds.emplace_back(string(it.first));
+    // dtype
+    ds.emplace_back(std::to_string(it.second.data_type_));
+    // num elements
+    auto num_elements = it.second.tensor_shape_.num_elements();
+    ds.emplace_back(std::to_string(num_elements));
+    auto total_bytes = num_elements * GetEnumSize(it.second.data_type_);
+    ds.emplace_back(std::to_string(total_bytes));
+    lc.push_back(absl::StrJoin(ds, ","));
+  }
+  return absl::StrJoin(lc, "\n");
+}
+
+void RecordTensorMetaData::WriteOutput(const std::string& content) const {
+  Env* env = Env::Default();
+  std::string path_dir = GetMetaOutput();
+  if (!env->FileExists(path_dir).ok()) {
+    LOG(INFO) << "File " << path_dir << " does not exist!";
+    env->CreateDir(path_dir);
+    LOG(INFO) << "CreateFileDir " << path_dir << " success!";
+  }
+
+  std::string cfn = path_dir + "/" + local_worker_name_;
+  // Write the content to the output file.
+  std::unique_ptr<WritableFile> file_to_write;
+  TF_CHECK_OK(env->NewWritableFile(cfn, &file_to_write));
+  TF_CHECK_OK(file_to_write->Append(content));
+  TF_CHECK_OK(file_to_write->Close());
+  TF_CHECK_OK(env->FileExists(cfn));
+}
+
+void RecordTensorMetaData::ReadFile(const std::string& filename,
+                                    StringPiece* content) {
+  Env* env = Env::Default();
+  // Read from the file and check content.
+  std::unique_ptr<RandomAccessFile> file_to_read;
+  TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read));
+  char scratch[1024];
+  CHECK_EQ(error::OUT_OF_RANGE,
+           file_to_read->Read(0 /* offset */, 1024 /* n */, content, scratch)
+               .code());
+}
+
+uint32 RecordTensorMetaData::GetEnumSize(const DataType& data_type) {
+  switch (data_type) {
+    case DT_FLOAT:
+      return 4;
+    case DT_DOUBLE:
+      return 8;
+    case DT_INT32:
+      return 4;
+    case DT_UINT32:
+      return 4;
+    case DT_UINT16:
+      return 2;
+    case DT_INT8:
+      return 1;
+    case DT_UINT8:
+      return 1;
+    case DT_INT16:
+      return 2;
+    case DT_INT64:
+      return 8;
+    case DT_UINT64:
+      return 8;
+    case DT_BOOL:
+      return 1;
+    default:
+      return 4;
+  }
+}
+
+void RecordTensorMetaData::GlobalRecord(const std::string& origin_tensor_name,
+                                        const TensorMetaData& m,
+                                        bool stop_record) {
+  // once send-driving starts, stop recording
+  if (stop_record) {
+    return;
+  }
+  mutex_lock l(global_tensor_meta_data_mu_);
+  auto tensor_name = ChannelRecordTensorMetaData::RegexEdgeName(
+      origin_tensor_name);
+  auto find = global_tensors_meta_data_.find(tensor_name);
+  // LOG(INFO) << "Record Tensorname:" << tensor_name;
+  if (find == global_tensors_meta_data_.end()) {
+    auto it =
+        global_tensors_meta_data_.emplace(tensor_name, TensorMetaData());
+    global_tensors_uid_parsed_key_.emplace(uid_, tensor_name);
+    auto& meta = global_tensors_meta_data_[tensor_name];
+    meta.uid_ = uid_;
+    if (it.second) {
+      uid_++;
+    }
+    meta.data_type_ = m.data_type_;
+    meta.tensor_shape_ = m.tensor_shape_;
+    meta.proto_size_ = m.proto_size_;
+    meta.is_dead_ = m.is_dead_;
+  } else {
+    auto& meta = find->second;
+    bool can_memcpy = DataTypeCanUseMemcpy(m.data_type_);
+    if (can_memcpy) {
+      int m1 = 1;
+      int m2 = 1;
+      for (int d = 0; d < m.tensor_shape_.dims(); d++) {
+        m1 *= m.tensor_shape_.dim_size(d);
+        m2 *= meta.tensor_shape_.dim_size(d);
+      }
+      if (m1 > m2) {
+        meta.data_type_ = m.data_type_;
+        meta.tensor_shape_ = m.tensor_shape_;
+        meta.proto_size_ = m.proto_size_;
+        meta.is_dead_ = m.is_dead_;
+      }
+    }
+    if (!can_memcpy && meta.proto_size_ > m.proto_size_) {
+      meta.data_type_ = m.data_type_;
+      meta.tensor_shape_ = m.tensor_shape_;
+      meta.proto_size_ = 10 * m.proto_size_;
+      meta.is_dead_ = m.is_dead_;
+    }
+  }
+  int tmp_sizes = 0;
+  for (const auto& k : global_tensors_meta_data_) {
+    tmp_sizes += ChannelRecordTensorMetaData::GetTensorBytes(k.second);
+  }
+  if (tmp_sizes > total_bytes_) {
+    total_bytes_ = tmp_sizes;
+    LOG(INFO) << "GlobalRecord bytes:" << total_bytes_;
+  }
+}
+
 RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
     : channel_(channel), name_(name) {}
@@ -769,6 +1784,26 @@ void RdmaMessageBuffer::FreeBuffer() {
   }
 }
 
+void RdmaMessageBuffer::ChunkCreateCPUBuffer(size_t size, void* buffer,
+                                             ibv_mr* mr, bool lock) {
+  CHECK(size > 0);
+  if (lock) {
+    mu_.lock();
+  }
+  if (local_status_ != none) {
+    // delete existing buffer
+  }
+  size_ = size;
+  buffer_ = buffer;
+  self_ = mr;
+  CHECK(self_) << "Failed to register memory region";
+  buffer_on_host_ = true;
+  local_status_ = idle;
+  if (lock) {
+    mu_.unlock();
+  }
+}
+
 // Allocate CPU memory for the Rdma buffer
 // Args:
 //   size: to-be-allocated memory size
@@ -827,6 +1862,44 @@ void RdmaMessageBuffer::Write(uint32_t imm_data, size_t buffer_size) {
         remote_.remote_addr, remote_.rkey, RDMA_WRITE_ID_MESSAGE, this);
 }
 
+// Generalized Write method with a prefix scatter/gather element
+void RdmaMessageBuffer::WriteWithPrefix(const RdmaChannel* channel,
+                                        uint32_t imm_data,
+                                        size_t buffer_size,
+                                        uint64_t src_addr,
+                                        uint32_t lkey,
+                                        uint64_t remote_addr,
+                                        uint32_t rkey,
+                                        RdmaWriteIDType write_type,
+                                        void* write_context,
+                                        uint64_t prefix_addr,
+                                        uint32_t prefix_lkey,
+                                        size_t prefix_size) {
+  // Two-element gather list: the prefix message followed by the payload.
+  // ibv_post_send() consumes the sge array before returning, so a stack
+  // array suffices and avoids leaking a heap allocation.
+  struct ibv_sge list[2];
+  list[0].addr = prefix_addr;
+  list[0].length = prefix_size;
+  list[0].lkey = prefix_lkey;
+
+  list[1].addr = src_addr;
+  list[1].length = buffer_size;
+  list[1].lkey = lkey;
+
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+
+  wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
+  wr.sg_list = list;
+  wr.num_sge = 2;
+  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  wr.imm_data = imm_data;
+  wr.wr.rdma.remote_addr = remote_addr;
+  wr.wr.rdma.rkey = rkey;
+
+  struct ibv_send_wr* bad_wr;
+  CHECK(!ibv_post_send(channel->qp_, &wr, &bad_wr)) << "Failed to post send";
+}
+
 // Generalized Write method
 void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
                               size_t buffer_size, uint64_t src_addr,
@@ -840,6 +1913,7 @@ void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
+
   wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
   wr.sg_list = &list;
   wr.num_sge = 1;
@@ -854,17 +1928,21 @@ void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
 }
 
 // Send the next ack from the buffer's job queue.
-void RdmaMessageBuffer::SendAck(const RdmaChannel* channel) {
-  Write(channel, RDMA_IMM_DATA_ACK, 0, 0, 0, 0, 0, RDMA_WRITE_ID_ACK, nullptr);
+void RdmaMessageBuffer::SendAck(const RdmaChannel* channel, int pair_index) {
+  Write(channel, RDMA_IMM_MAX_REQUEST_ID + pair_index, 0, 0, 0, 0, 0,
+        RDMA_WRITE_ID_ACK, nullptr);
 }
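// WriteWithPrefix() above posts a single RDMA_WRITE_WITH_IMM whose gather
// list lands as one contiguous [DriverPrefixMessage | payload] buffer on the
// receiver. A sketch of the receive-side layout under that assumption (the
// DriverPrefixMessage parser itself is not shown in this patch):
//
//   const char* base = static_cast<const char*>(recv_region_addr);
//   // first kPrefixMessageTotalBytes: shape / tensor_bytes / is_dead / ...
//   const char* payload =
//       base + DriverPrefixMessage::kPrefixMessageTotalBytes;
//   // memcpy-able dtypes are then read in place from 'payload'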
 
 // Send the next message from the buffer's job queue.
 void RdmaMessageBuffer::SendNextItem() {
-  uint32_t imm_data = RDMA_IMM_DATA_MESSAGE;
+  uint32_t imm_data = RDMA_IMM_DATA_ACK + pair_index_;
   mu_.lock();
   if (!queue_.empty() && (local_status_ == idle) && (remote_status_ == idle)) {
     local_status_ = busy;
     remote_status_ = busy;
+    time_guard_ = 0;
+    rm_ack_micros_ = 0;
+    // LOG(ERROR) << "SendNextItem queue size:" << queue_.size();
     string message = queue_.front();
     queue_.pop();
     // local/remote_status_ won't be set back to idle
@@ -969,47 +2047,63 @@ static void StreamGPUOp(Device* gpu_device,
                         const DeviceContext* device_context,
 
 RdmaTensorResponse* RdmaChannel::AddTensorResponse(const RdmaMessage& rm) {
   mutex_lock lock{mu_};
-  auto it =
-      responses_table_.emplace(rm.request_index_, RdmaTensorResponse(this, rm));
+  auto it = responses_table_.emplace(
+      rm.request_index_, std::make_shared<RdmaTensorResponse>(this, rm));
   CHECK(it.second) << "Response with the ID " << rm.request_index_
                    << " already exists.";
-  return &it.first->second;
+  // replicate the request index on the response
+  it.first->second->request_index_ = rm.request_index_;
+  return it.first->second.get();
 }
 
 RdmaTensorResponse* RdmaChannel::UpdateTensorResponse(const RdmaMessage& rm) {
   mutex_lock lock{mu_};
   auto it = responses_table_.find(rm.request_index_);
   CHECK(it != responses_table_.end()) << "No response found.";
-  RdmaTensorResponse* response = &it->second;
+  RdmaTensorResponse* response = it->second.get();
   response->Update(rm);
   return response;
 }
 
 void RdmaChannel::RemoveTensorResponse(uint32_t request_index) {
   mutex_lock lock{mu_};
-  responses_table_.erase(request_index);
+  if (responses_table_.find(request_index) != responses_table_.end())
+    responses_table_.erase(request_index);
 }
 
 void RdmaTensorResponse::Start() {
+  // LOG(INFO) << "RdmaTensorResponse Start...";
   Rendezvous::ParsedKey parsed;
   Status s = Rendezvous::ParseKey(rm_.name_, &parsed);
+  if (s.ok()) {
+    s = PrepareRecvTensor(parsed, &src_dev_);
+  }
   if (!s.ok()) {
-    SendErrorStatus(s);
+    SendErrorStatus(s, "RdmaTensorResponse::Start::PrepareRecvTensor");
     return;
   }
-
+  recv_local_send_rdma_ = 0;
   channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(
       rm_.step_id_, parsed,
-      [this, parsed](const Status& status, const Rendezvous::Args& send_args,
+      [this](const Status& status, const Rendezvous::Args& send_args,
              const Rendezvous::Args& recv_args, const Tensor& in,
-             bool is_dead) {
-        CHECK(status.ok()) << "RecvLocalAsync was not ok."
-                           << " error message: " << status.error_message();
-        RecvHandler(parsed, send_args, recv_args, in, is_dead);
+             bool is_dead) mutable {
+        // NOTE(wuyongyu02): if the sender cannot obtain the tensor from the
+        // local rendezvous it shouldn't CHECK-fail; it can report the error
+        // the same way RdmaTensorResponse::RecvHandler() does, via
+        // SendErrorStatus(status).
+        if (!status.ok()) {
+          // SendErrorStatus(status, "rendezvous_mgr->RecvLocalAsync::");
+          return;
+        }
+        RecvHandler(send_args, recv_args, in, is_dead);
       });
 }
 
-void RdmaTensorResponse::Resume() { SendContent(*tensor_, *proto_, is_dead_); }
+void RdmaTensorResponse::Resume() {
+  SendContent(*tensor_, *proto_, is_dead_, true);
+}
 
 // Helper for RecvTensor. Validates "key" and returns the source
 // device in "*src_dev".
@@ -1035,16 +2129,9 @@ Status RdmaTensorResponse::PrepareRecvTensor(
   return Status::OK();
 }
 
-void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
-                                     const Rendezvous::Args& send_args,
+void RdmaTensorResponse::RecvHandler(const Rendezvous::Args& send_args,
                                      const Rendezvous::Args& recv_args,
                                      const Tensor& in, bool is_dead) {
-  Status s = PrepareRecvTensor(parsed, &src_dev_);
-  if (!s.ok()) {
-    SendErrorStatus(s);
-    return;
-  }
-
   meta_data_changed_ = TensorMetaDataChanged(in, is_dead);
 #ifdef RDMA_DATA_VALIDATION
   // Always send a meta data message with the source checksum
@@ -1071,9 +2158,7 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
     // so anyway we'll have to copy it from GPU to CPU first. If at some
    // point in time Clone() is changed to only save a shallow copy, we can
     // skip the copy here as well.
-    if ((in.TotalBytes() > 0) && !meta_data_changed_ &&
-        (RdmaMemoryMgr::Singleton().FindMemoryRegion(
-             (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
+    if ((in.TotalBytes() > 0) && !meta_data_changed_) {
       StreamGPUOp(src_dev_, send_dev_context,
                   [this, in, proto, is_dead](const Status& s) {
                     Send(in, proto, is_dead, s);
@@ -1101,7 +2186,8 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
                         });
     }
 #else
-    SendErrorStatus(errors::Internal("No GPU device in process"));
+    SendErrorStatus(errors::Internal("No GPU device in process"),
+                    "No GPU device in process");
 #endif  // GOOGLE_CUDA
   } else {
     // tensor is in CPU memory.
@@ -1115,17 +2201,180 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
 void RdmaTensorResponse::Send(const Tensor& in, const TensorProto& proto,
                               bool is_dead, const Status& status) {
   if (!status.ok()) {
-    SendErrorStatus(status);
+    SendErrorStatus(status, "RdmaTensorResponse::Send::!status.ok");
+    return;
+  }
+  SendBck(in, proto, is_dead, status);
+}
+
+void RdmaChannel::SendDriverData(const Tensor& in,
+                                 bool is_dead,
+                                 const std::string& name) {
+  bool has_data = false;
+  std::shared_ptr<DriverEntry> entry =
+      rdma_send_driver_mgr_->GetRecvEntry(name, &has_data);
+
+  CHECK(entry.get() != nullptr) << "Channel SendDriverData to "
+                                << name
+                                << " is_dead:"
+                                << is_dead
+                                << " dtype:"
+                                << DataTypeString(in.dtype())
+                                << " shape:"
+                                << in.shape();
+
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  TensorProto proto;
+  if (!can_memcpy) {
+    in.AsProtoTensorContent(&proto);
+  }
+  size_t tensor_bytes = can_memcpy ? in.TotalBytes() : proto.ByteSize();
+  if (is_dead) {
+    tensor_bytes = 0;
+  }
+  entry->send_micros_ = 0;
+  // prefix
+  string prefix = DriverPrefixMessage::CreateDriverPrefixMessage(in.shape(),
+      tensor_bytes, is_dead, entry->send_micros_, entry->meta_changed_);
+  uint32_t imm_data = entry->uinque_id_;
+
+  // tensor
+  uint32_t send_tensor_lkey = 0;
+  size_t prefix_s = prefix.size();
+  int need_length = prefix_s + tensor_bytes;
+  if (entry->tensor_addr_ == nullptr) {
+    if (!FindLocalMr(name, &entry->tensor_addr_,
+                     &entry->smr_, &entry->local_allocate_size_)) {
+      entry->local_allocate_size_ = 0;
+    }
+  }
+  if (need_length > entry->local_allocate_size_) {
+    LOG(INFO) << "key:" << name << " realloc need_length:"
+              << need_length
+              << " already size:"
+              << entry->local_allocate_size_;
+    entry->local_allocate_size_ = Alloc(prefix_s +
+        VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * tensor_bytes,
+        &entry->tensor_addr_, &entry->smr_, false);
+  }
+
+  if (!is_dead) {
+    if (can_memcpy) {
+      // allocate region and copy data
+      entry->src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
+      if (entry->src_buffer_ != nullptr) {
+        if (tensor_bytes > 0) {
+          void* addr_offset =
+              (void*)((uint64_t)entry->tensor_addr_ + prefix_s);
+          memcpy(addr_offset, DMAHelper::base(&in), tensor_bytes);
+        }
+      }
+    } else {
+      // for send-driven transfers
+      void* addr_offset = (void*)((uint64_t)entry->tensor_addr_ + prefix_s);
+      proto.SerializeToArray(addr_offset, tensor_bytes);
+    }
+  } else {
+    tensor_bytes = 0;
+  }
+  memcpy(entry->tensor_addr_, prefix.data(), prefix_s);
+  send_tensor_lkey = (entry->smr_ == nullptr) ? 0 : entry->smr_->lkey;
+  // remote mr addr
+  uint64_t remote_addr = entry->addr_;
+  uint32_t rkey = entry->lkey_;
+  CHECK(tensor_bytes + prefix_s <= entry->allocate_size_)
+      << " name:" << name
+      << " may need a larger static memory allocation ratio;"
+      << " tensor_bytes:" << tensor_bytes
+      << " prefix_s:" << prefix_s
+      << " entry->allocate_size_:" << entry->allocate_size_;
+  auto tensor_addr = (uint64_t)entry->tensor_addr_;
+  RdmaMessageBuffer::Write(this, imm_data, tensor_bytes + prefix_s,
+                           tensor_addr, send_tensor_lkey, remote_addr, rkey,
+                           RDMA_WRITE_ID_SEND_DEIVER_WRITE, entry.get());
+}
+
+void RdmaChannel::InitAndSetDriverStatus() {
+  size_t entries_size = rdma_send_driver_mgr_->InitLocalDriverEntry();
+  // init LocalDriverBufferMgr
+  size_t ready_size = local_driver_buffer_mgr_->InitLocalDriverBufferMgr();
+  CHECK_EQ(entries_size, ready_size)
+      << "NotifyAsyncAllocator entries_size:"
+      << entries_size
+      << " ready_size:"
+      << ready_size;
+  // TODO(wuyongyu): could_send_driver_ must be set before the async
+  // InitLocalDriverEntry.
+  could_send_driver_ = true;
+}
+
+void RdmaChannel::PleSendOrCheck() {
+  const auto& remote_name = remote_name_;
+  const auto& local_name = local_name_;
+  RDMA_LOG(1) << "PleSendOrCheck local_worker_name:" << local_name
+              << " remote_name:" << remote_name;
+
+  // get the channel cache
+  SharedGrpcChannelPtr client_channel =
+      channel_cache_->FindWorkerChannel(remote_name);
+  CHECK(client_channel != nullptr) << "PleSendOrCheck target:"
+                                   << remote_name
+                                   << " client_channel is null!";
+  GrpcVerbsClient* client = new GrpcVerbsClient(client_channel);
+  CHECK(client != nullptr) << "PleSendOrCheck No worker known as "
+                           << remote_name;
+
+  PleSendOrCheckReq req;
+  req.set_host_name(local_name);
+  // synchronous call
+  PleSendOrCheckResp resp;
+  Status s;
+  int attempts = 0;
+  static const int max_num_attempts = 5;
+  do {
+    s = client->ReqPleSendOrCheck(&req, &resp);
+    // save obtained remote addresses
+    // connect to the remote channel
+    if (s.ok() && resp.is_ok()) {
+      LOG(INFO) << "verbs to " << remote_name
+                << " ReqPleSendOrCheck succeed!";
+    } else {
+      LOG(ERROR) << "ReqPleSendOrCheck Connecting to "
+                 << remote_name << ": Got "
+                 << s.error_message() << ". Retrying (" << (attempts + 1)
+                 << "/" << max_num_attempts << ")..."
+                 << " remote worker async status:" << resp.is_ok();
+      if (++attempts == max_num_attempts) {
+        LOG(FATAL) << "RdmaChannel::PleSendOrCheck failed";
+      }
+      adapter_->worker_env_->env->SleepForMicroseconds(2000000);
+    }
+  } while (!s.ok() || !resp.is_ok());
+  delete client;
+}
+
+void RdmaTensorResponse::SendBck(const Tensor& in, const TensorProto& proto,
+                                 bool is_dead, const Status& status) {
+  if (!status.ok()) {
+    SendErrorStatus(status, "RdmaTensorResponse::SendBck::!status.ok");
     return;
   }
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   bool proto_size_changed =
       (!can_memcpy) && (proto.ByteSize() != rm_.tensor_bytes_);
+
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_buffer = channel_->message_buffers()[buffer_index];
+  // move cpu allocator tensor to RdmaMR tensor
+  // RdmaClone(in, proto, is_dead);
   if (meta_data_changed_ || proto_size_changed) {
     Clone(in, proto, is_dead);
+    // NOTE: SendMetaData() only enqueues the meta-data update; the paired
+    // message buffer must be kicked explicitly to transmit it.
     SendMetaData(in, proto, is_dead);
+    tx_buffer->SendNextItem();
   } else {
-    SendContent(in, proto, is_dead);
+    SendContent(in, proto, is_dead, false);
   }
 }
 
@@ -1134,6 +2383,21 @@ bool RdmaTensorResponse::TensorMetaDataChanged(const Tensor& in, bool is_dead) {
          (rm_.is_dead_ != is_dead);
 }
 
+void RdmaTensorResponse::RdmaClone(const Tensor& in, const TensorProto& proto,
+                                   bool is_dead) {
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  if (can_memcpy && (in.TotalBytes() > 0)) {
+    tensor_ =
+        new Tensor(channel_->rdma_mem_allocator_, in.dtype(), in.shape());
+    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  } else {
+    tensor_ = new Tensor(in.dtype(), in.shape());
+  }
+  if (!can_memcpy) {
+    proto_ = new TensorProto(proto);
+  }
+  is_dead_ = is_dead;
+}
+
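// RdmaClone()/Clone() above copy the tensor into memory that is already
// ibv_reg_mr()'d, so the subsequent RDMA write needs no registration lookup
// or extra staging copy. FakeAllocator (used below) follows the same idea:
// wrap a pre-registered address so a Tensor can alias it. Its definition is
// not part of this patch; a minimal sketch of such a wrapper, offered only
// as an assumption about its shape:
//
//   class PreRegisteredAllocator : public Allocator {
//    public:
//     explicit PreRegisteredAllocator(void* base) : base_(base) {}
//     string Name() override { return "pre_registered"; }
//     void* AllocateRaw(size_t, size_t) override { return base_; }
//     void DeallocateRaw(void*) override {}  // region outlives the tensor
//    private:
//     void* base_;
//   };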
 void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
                                bool is_dead) {
   // Clone the data to be sent later. For simplicity, we clone the tensor's
   // data even if it is already a copy. Performance is less of a concern
   // here; note that some tensors share their buffer between different
   // step-ids, so the tensor content may change before the re-request
   // completes.
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  // if (can_memcpy && (in.TotalBytes() > 0)) {
+  //   AllocatorAttributes host_alloc_attrs;
+  //   host_alloc_attrs.set_nic_compatible(true);
+  //   host_alloc_attrs.set_on_host(true);
+  //   Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
+  //   tensor_ = new Tensor(allocator, in.dtype(), in.shape());
+  //   memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  // } else {
+  //   tensor_ = new Tensor(in.dtype(), in.shape());
+  // }
   if (can_memcpy && (in.TotalBytes() > 0)) {
-    AllocatorAttributes host_alloc_attrs;
-    host_alloc_attrs.set_nic_compatible(true);
-    host_alloc_attrs.set_on_host(true);
-    Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
-    tensor_ = new Tensor(allocator, in.dtype(), in.shape());
-    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+    channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+        &src_addr_, &mr_, &res_region_, in.TotalBytes());
+    memcpy(src_addr_, DMAHelper::base(&in), in.TotalBytes());
+    res_fake_allocator_ = new FakeAllocator(src_addr_);
+    tensor_ = new Tensor(res_fake_allocator_, in.dtype(), in.shape());
   } else {
     tensor_ = new Tensor(in.dtype(), in.shape());
   }
@@ -1160,12 +2434,13 @@ void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
 
 void RdmaTensorResponse::SendMetaData(const Tensor& in,
                                       const TensorProto& proto, bool is_dead) {
+  // LOG(INFO) << "SendMetaData...";
+  send_meta_begin_ = 0;
   RDMA_LOG(2) << "Request #" << rm_.request_index_
               << ": Meta data changed: " << rm_.name_;
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
-
   // Send meta-data update:
   RdmaMessage rm;
   rm.type_ = RDMA_MESSAGE_META_DATA_UPDATE;
   rm.name_size_ = rm_.name_.size();
@@ -1186,31 +2461,46 @@ void RdmaTensorResponse::SendMetaData(const Tensor& in,
               << " data-type = " << DataTypeString(rm.data_type_) << "."
               << " is-dead = " << rm.is_dead_ << ")";
 
+  // rm.create_micros_ = send_meta_begin_;
   string message = RdmaMessage::CreateMessage(rm);
-  channel_->tx_message_buffer_->EnqueueItem(message);
-  channel_->tx_message_buffer_->SendNextItem();
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_message_buffer = channel_->message_buffers()[buffer_index];
+  tx_message_buffer->EnqueueItem(message);
 }
 
 void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
-                                     bool is_dead) {
+                                     bool is_dead, bool is_resume) {
+  // update recv_local_send_rdma_; on resume the tensor was already cloned
+  // into RDMA-registered memory, so the copy below can be skipped.
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
   size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
   uint32_t imm_data = rm_.request_index_;
+
+  AllocatorAttributes host_alloc_attrs;
+  host_alloc_attrs.set_nic_compatible(true);
+  host_alloc_attrs.set_on_host(true);
+  Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
   if (!is_dead) {
-    if (can_memcpy) {
+    if ((can_memcpy && !is_resume) || in.TotalBytes() == 0) {
+      // when sending content directly, the data has to be copied
       src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
       if (src_buffer_ != nullptr) {
-        src_buffer_->Ref();  // Keep buffer alive until write is complete
-        src_addr_ = src_buffer_->data();
-        mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(src_addr_,
-                                                          tensor_bytes);
+        // src_buffer_->Ref();  // Keep buffer alive until write is complete
+        // TODO(wuyongyu02): Move to Meta Change
+        channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+            &src_addr_, &mr_, &res_region_, tensor_bytes);
+        if (tensor_bytes > 0) {
+          memcpy(src_addr_, src_buffer_->data(), tensor_bytes);
+        }
       }
-    } else {
+    }
+    if (!can_memcpy) {
       RDMA_LOG(2) << "Encoding proto: " << rm_.name_
                   << " (Size: " << tensor_bytes << ") " << in.DebugString();
-      src_addr_ = malloc(tensor_bytes);
-      mr_ = ibv_reg_mr(channel_->adapter_->pd_, src_addr_, tensor_bytes,
-                       IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+      channel_->FindOrCreateRemoteBytesAddrMemoryRegion(rm_.name_,
+          &src_addr_, &mr_, &res_region_, tensor_bytes);
       proto.SerializeToArray(src_addr_, tensor_bytes);
     }
   } else {
@@ -1230,7 +2520,8 @@ void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
                 rm_.rkey_, RDMA_WRITE_ID_TENSOR_WRITE, this);
 }
 
-void RdmaTensorResponse::SendErrorStatus(const Status& status) {
+void RdmaTensorResponse::SendErrorStatus(const Status& status,
                                         const std::string& src_func_name) {
   RdmaMessage rm;
   rm.type_ = RDMA_MESSAGE_ERROR_STATUS;
   rm.name_size_ = rm_.name_.size();
@@ -1238,28 +2529,40 @@ void RdmaTensorResponse::SendErrorStatus(const Status& status) {
   rm.step_id_ = rm_.step_id_;
   rm.request_index_ = rm_.request_index_;
   rm.status_ = status;
-  LOG(ERROR) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+
+  LOG(INFO) << "Step 0x" << std::hex << rm.step_id_ << std::dec
             << ": Sending RDMA_MESSAGE_ERROR_STATUS #" << rm.request_index_
-            << ": " << rm.name_ << ". Status: " << status.ToString();
+            << ": " << rm.name_ << ". Status: " << status.ToString()
+            << " src_func_name:" << src_func_name;
   string message = RdmaMessage::CreateMessage(rm);
-  channel_->tx_message_buffer_->EnqueueItem(message);
-  channel_->tx_message_buffer_->SendNextItem();
+  int pair_index = (request_index_ % RdmaChannel::kNumMessageBuffers) / 2;
+  int buffer_index = 2 * pair_index;
+  auto* tx_message_buffer = channel_->message_buffers()[buffer_index];
+  tx_message_buffer->EnqueueItem(message);
+  tx_message_buffer->SendNextItem();
   // Destroy the response.
   Destroy();
 }
 
 void RdmaTensorResponse::Destroy() {
+  if (res_region_.get() != nullptr) {
+    // res_region_->Unref();
+  }
+  // response end
   if (src_buffer_ != nullptr) {
-    src_buffer_->Unref();
+    // src_buffer_->Unref();
   }
   if (tensor_ != nullptr) {
     delete tensor_;
   }
   if (proto_ != nullptr) {
-    ibv_dereg_mr(mr_);
-    free(src_addr_);
+    // ibv_dereg_mr(mr_);
+    // free(src_addr_);
     delete proto_;
   }
   // Remove response from the pending list:
@@ -1275,8 +2578,10 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   // Rdma Message format
   // type|name_size|name|step_id|request_index|remote_addr|rkey|is_dead|...
   //   1B|    2B   | 512|  8B   |     8B      |     8B    | 4B |   1B  |...
-  // ...|data_type|tensor_shape|tensor_bytes|error_status          |
-  // ...|   XB    |    XB      |    8B      |size - 4B, proto - XB |
+  // ...|data_type|tensor_shape|tensor_bytes|create_micros|error_status          |
+  // ...|   XB    |    XB      |    8B      |     8B      |size - 4B, proto - XB |
+  // ...|remote_bytes_addr|remote_bytes_value|
+  // ...|       8B        |        4B        |
   //
   // ACK:             Imm-type: ACK
   // TENSOR_REQUEST:  Imm-type: MESSAGE
@@ -1292,12 +2597,13 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   //   Fields: type, request_index, name, step_id, error_status
   // Tensor content:  Imm-type: request_index
   size_t message_size = kMessageTotalBytes;
-  char message[kMessageTotalBytes + kErrorStatusMaxSize];
+  char message[kMessageTotalBytes + kErrorStatusMaxSize + 100];
   // type
   message[kTypeStartIndex] = static_cast<char>(rm.type_) & 0xff;
   // request index
   memcpy(&message[kRequestIndexStartIndex], &rm.request_index_,
          sizeof(rm.request_index_));
+
   // name, step_id, remote_addr, rkey
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
@@ -1308,10 +2614,16 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
            sizeof(rm.remote_addr_));
     memcpy(&message[kRkeyStartIndex], &rm.rkey_, sizeof(rm.rkey_));
     memcpy(&message[kStepIdStartIndex], &rm.step_id_, sizeof(rm.step_id_));
+
+    // memcpy(&message[KRemoteBytesAddrKeyStartIndex],
+    //        &rm.remote_bytes_addr_key_, sizeof(rm.remote_bytes_addr_key_));
+    // memcpy(&message[KRemoteBytesAddrStartIndex],
+    //        &rm.remote_bytes_addr_, sizeof(rm.remote_bytes_addr_));
   }
-  // is_dead, data_type, tensor_shape, tensor_bytes
+  // is_dead, data_type, tensor_shape, tensor_bytes, create_micros
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&message[kIsDeadStartIndex], &rm.is_dead_, sizeof(rm.is_dead_));
@@ -1321,14 +2633,18 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
            sizeof(rm.tensor_shape_));
     memcpy(&message[kTensorBytesStartIndex], &rm.tensor_bytes_,
            sizeof(rm.tensor_bytes_));
+    // memcpy(&message[kCreateMicrosStartIndex], &rm.create_micros_,
+    //        sizeof(rm.create_micros_));
   }
-// checksum
+  // checksum
 #ifdef RDMA_DATA_VALIDATION
   memcpy(&message[kChecksumStartIndex], &rm.checksum_, sizeof(rm.checksum_));
 #endif
   // error status
   if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
     ::grpc::Status gs = ToGrpcStatus(rm.status_);
+    // (wuyongyu) decrease the error message size:
+    // https://km.sankuai.com/page/403000580
+    // ::grpc::Status gs = ::grpc::Status::OK;
     ErrorStatusProto gsProto;
     gsProto.set_error_code(gs.error_code());
     gsProto.set_error_message(gs.error_message());
@@ -1348,12 +2664,47 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   return string(message, message_size);
 }
 
-// Parse a RdmaMessage according to the pre-defined format
-// Args:
-//   rm: the message structure where the parsed message will be saved
-//   buffer: the place where the raw message is stored
-// Returns:
-//   None
+string FussionMessages::CreateFusionMessages(
+    const std::vector<RdmaMessage>& rmv) {
+  CHECK(rmv.size() < kRdmaMaxMessagesNumber)
+      << "FussionMessages CreateFusionMessages must fuse fewer than "
+      << kRdmaMaxMessagesNumber << " messages";
+  size_t message_size = kTotalFussionMessageSize;
+  char message[kTotalFussionMessageSize + RdmaMessage::kRdmaMessageBufferSize];
+  uint32_t* mn = (uint32_t*)&message[kMessageNumbersStartIndex];
+  *mn = rmv.size();
+  for (size_t i = 0; i < rmv.size(); i++) {
+    string m = RdmaMessage::CreateMessage(rmv[i]);
+    uint32_t* ms = (uint32_t*)&message[kMessageSizeStartIndex + i * 4];
+    *ms = m.size();
+    memcpy(&message[KStringMessagesStartIndex +
+                    i * RdmaMessage::kRdmaMessageBufferSize],
+           m.data(), m.size());
+  }
+  return string(message, message_size);
+}
+
+void FussionMessages::ParseFussionMessages(std::vector<RdmaMessage>& rmv,
+                                           void* buffer) {
+  char* message = static_cast<char*>(buffer);
+  uint32_t mn = 0;
+  memcpy(&mn, &message[kMessageNumbersStartIndex], sizeof(mn));
+  if (mn == 0) {
+    return;
+  }
+  // resize (not reserve): the loop below writes into rmv[i]
+  rmv.resize(mn);
+  for (uint32_t i = 0; i < mn; i++) {
+    uint32_t message_size;
+    memcpy(&message_size, &message[kMessageSizeStartIndex + i * 4],
+           sizeof(message_size));
+    char m[RdmaMessage::kMessageTotalBytes +
+           RdmaMessage::kErrorStatusMaxSize + 100];
+    memcpy(m, &message[KStringMessagesStartIndex +
+                       i * RdmaMessage::kRdmaMessageBufferSize], message_size);
+    RdmaMessage::ParseMessage(rmv[i], m);
+  }
+}
+
 void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   char* message = static_cast<char*>(buffer);
   // type
@@ -1361,6 +2712,7 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   // request index
   memcpy(&rm.request_index_, &message[kRequestIndexStartIndex],
          sizeof(rm.request_index_));
+
   // name, step_id, remote_addr, rkey
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
@@ -1371,10 +2723,17 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
            sizeof(rm.remote_addr_));
     memcpy(&rm.rkey_, &message[kRkeyStartIndex], sizeof(rm.rkey_));
     memcpy(&rm.step_id_, &message[kStepIdStartIndex], sizeof(rm.step_id_));
+    // memcpy(&rm.remote_bytes_addr_key_,
+    //        &message[KRemoteBytesAddrKeyStartIndex],
+    //        sizeof(rm.remote_bytes_addr_key_));
+    // memcpy(&rm.remote_bytes_addr_,
+    //        &message[KRemoteBytesAddrStartIndex],
+    //        sizeof(rm.remote_bytes_addr_));
   }
   // data_type, tensor_bytes, tensor_shape, is_dead
   if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
       (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_DRIVER_BEGIN) ||
      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&rm.is_dead_, &message[kIsDeadStartIndex], sizeof(rm.is_dead_));
     memcpy(&rm.data_type_, &message[kDataTypeStartIndex],
@@ -1383,8 +2742,10 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
            sizeof(rm.tensor_shape_));
     memcpy(&rm.tensor_bytes_, &message[kTensorBytesStartIndex],
            sizeof(rm.tensor_bytes_));
+    // memcpy(&rm.create_micros_, &message[kCreateMicrosStartIndex],
+    //        sizeof(rm.create_micros_));
   }
-// checksum
+  // checksum
 #ifdef RDMA_DATA_VALIDATION
   memcpy(&rm.checksum_, &message[kChecksumStartIndex], sizeof(rm.checksum_));
 #endif
@@ -1401,6 +2762,10 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   }
 }
 
+ibv_mr* RdmaChannel::FindMemoryRegion(void* addr, size_t length) {
+  return rdma_memory_mgr_->FindMemoryRegion(addr, length);
+}
+
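// Wire layout produced by CreateFusionMessages() and consumed by
// ParseFussionMessages() above: a count, a size table, then each message at
// a fixed stride, so entry i is addressable without scanning:
//
//   [kMessageNumbersStartIndex]                  uint32 n
//   [kMessageSizeStartIndex + 4*i]               uint32 size[i]
//   [KStringMessagesStartIndex
//       + i * RdmaMessage::kRdmaMessageBufferSize]  message[i] bytes
//
// e.g. message i is recovered with:
//   memcpy(m, &buf[KStringMessagesStartIndex +
//                  i * RdmaMessage::kRdmaMessageBufferSize], size[i]);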
 //*****************************************************************************
 // RdmaMemoryMgr
 //*****************************************************************************
@@ -1423,12 +2788,15 @@ void RdmaMemoryMgr::InsertMemoryRegion(void* addr, size_t length,
     RDMA_LOG(1) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
                 << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
                 << " SIZE: 0x" << length << " (" << allocator_name << ").";
+  // LOG(INFO) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
+  //           << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
+  //           << " SIZE: 0x" << length << " (" << allocator_name << ").";
   if (mr != nullptr) {
     mutex_lock l(mrs_mu_);
     auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
     mrs_.insert(iter, {mr, &MRDeleter});
   } else {
-    LOG(WARNING) << "Cannot register memory region";
+    LOG(FATAL) << "Cannot register memory region";
   }
 }
 
@@ -1445,7 +2813,7 @@ void RdmaMemoryMgr::EvictMemoryRegion(void* addr, size_t length) {
   }
 }
 
-const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
+const TensorMetaData* RdmaChannel::GetTensorMetaData(
     const std::string& tensor_name) {
   mutex_lock l(tensor_meta_data_mu_);
   auto it = tensors_meta_data_.find(tensor_name);
@@ -1455,7 +2823,7 @@ const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
   return &it->second;
 }
 
-const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
+const TensorMetaData* RdmaChannel::SetTensorMetaData(
     const std::string& tensor_name, DataType dtype, const TensorShape& shape,
     bool is_dead, size_t proto_size) {
   mutex_lock l(tensor_meta_data_mu_);
@@ -1471,49 +2839,276 @@ const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
 // RdmaTensorRequest
 //*****************************************************************************
 
+Status LocalDriverBufferMgr::QueueRdmaSave(const string& key,
+    const Args& send_args, Tensor* val, const bool is_dead,
+    const uint64& send_begin_micros) {
+  string key_hash(key);
+  if (!status_.ok()) {
+    Status s = status_;
+    return s;
+  }
+  QueueItems* queue_pair = queue_table_[key_hash];
+  CHECK(queue_pair != nullptr) << "QueueRdmaSave queue_pair is nullptr:"
+                               << key_hash;
+  ItemQueue* queue_item = queue_pair->queue;
+  queue_pair->queue_lock_.lock();
+  if (queue_item->empty() || queue_item->front()->HasValue()) {
+    RDMA_LOG(1) << "QueueRdmaSave Enqueue Send Item (key:" << key << "). ";
+    Item* item = new Item;
+    item->value = val;
+    item->is_dead = is_dead;
+    item->has_value = true;
+    item->send_args = send_args;
+    item->send_start_micros_ = Env::Default()->NowMicros();
+    if (item->send_args.device_context) {
+      item->send_args.device_context->Ref();
+    }
+    queue_item->push_back(item);
+    // LOG(INFO) << "QueueRdmaEnqueueSendWaitRecv_Micros:"
+    //           << item->send_start_micros_ - send_args.rendezvous_micros;
+    queue_pair->queue_lock_.unlock();
+    return Status::OK();
+  }
+  RDMA_LOG(1) << "QueueRdmaSave Consume Recv Item (key:" << key << "). ";
+  Item* item = queue_item->front();
+  if (queue_item->size() == 1) {
+    VLOG(2) << "Clean up Send/Recv queue (key:" << key << "). ";
+    // queue_table_.erase(key_hash);
+  }
+  queue_item->pop_front();
+  queue_pair->queue_lock_.unlock();
+  DCHECK(item->HasCallback());
+  // LOG(INFO) << "QueueRdmaRecvWaitSend_Micros key:" << key << " micros:"
+  //           << Env::Default()->NowMicros() - item->recv_start_micros_;
+  item->waiter(Status::OK(), send_args, item->recv_args, *val, is_dead);
+  delete item;
+  return Status::OK();
+}
+
+Status LocalDriverBufferMgr::RdmaSave(const string& key, const Args& send_args,
+                                      const Tensor& val, const bool is_dead) {
+  LOG(FATAL) << "RdmaSave should not be used; QueueRdmaSave is the"
+             << " supported path.";
+  return Status::OK();
+}
+
+void LocalDriverBufferMgr::QueueLoadAsync(const string& key,
+    const Args& recv_args, DoneCallback done,
+    const uint64& request_start_micros) {
+  string key_hash(key);
+  if (!status_.ok()) {
+    // Rendezvous has been aborted.
+    Status s = status_;
+    done(s, Args(), recv_args, Tensor(), false);
+    return;
+  }
+  const auto& find = queue_table_.find(key_hash);
+  if (find == queue_table_.end()) {
+    for (auto& find : queue_table_) {
+      if (absl::StrContains(key_hash, find.first)) {
+        key_hash = find.first;
+        break;
+      }
+    }
+  }
+  QueueItems* queue_pair = queue_table_[key_hash];
+  CHECK(queue_pair != nullptr)
+      << "QueueLoadAsync queue_pair is null:" << key_hash;
+  ItemQueue* queue_item = queue_pair->queue;
+
+  queue_pair->queue_lock_.lock();
+  if (queue_item->empty() || !queue_item->front()->HasValue()) {
+    CancellationManager* cm = recv_args.cancellation_manager;
+    CancellationToken token = CancellationManager::kInvalidToken;
+    bool already_cancelled = false;
+    if (cm != nullptr) {
+      token = cm->get_cancellation_token();
+      already_cancelled = !cm->RegisterCallback(token, [this, token,
+                                                        key_hash] {
+        Item* item = nullptr;
+        {
+          QueueItems* queue_pair = queue_table_[key_hash];
+          ItemQueue* queue_item = queue_pair->queue;
+          if (queue_item->empty() || !queue_item->front()->HasValue()) {
+            for (auto it = queue_item->begin(); it != queue_item->end();
+                 it++) {
+              if ((*it)->cancellation_token == token) {
+                item = *it;
+                // the key_hash queue can be reused, so only the item is
+                // erased (not the queue_table_ entry); break here, since
+                // erase() invalidates the iterator.
+                queue_item->erase(it);
+                break;
+              }
+            }
+          }
+        }
+        if (item != nullptr) {
+          item->waiter(StatusGroup::MakeDerived(
+                           errors::Cancelled("LoadAsync is cancelled.")),
+                       Args(), item->recv_args, Tensor(), /*is_dead=*/false);
+          delete item;
+        }
+      });
+    }
+    if (already_cancelled) {
+      queue_pair->queue_lock_.unlock();
+      done(StatusGroup::MakeDerived(
+               errors::Cancelled("LoadAsync is cancelled.")),
+           Args(), recv_args, Tensor(), /*is_dead=*/false);
+      return;
+    }
+    RDMA_LOG(1) << "LoadAsync Enqueue Recv Item (key:" << key << "). ";
+    Item* item = new Item;
+    if (cm != nullptr) {
+      auto wrapped_done = std::bind(
+          [cm, token](const DoneCallback& done,
+                      // Begin unbound arguments.
+                      const Status& s, const Args& send_args,
+                      const Args& recv_args, const Tensor& v, bool dead) {
+            cm->TryDeregisterCallback(token);
+            RDMA_LOG(1) << "LoadAsync Enqueue Recv DoneCallback begin...";
+            done(s, send_args, recv_args, v, dead);
+          },
+          std::move(done), std::placeholders::_1, std::placeholders::_2,
+          std::placeholders::_3, std::placeholders::_4,
+          std::placeholders::_5);
+      item->waiter = std::move(wrapped_done);
+    } else {
+      item->waiter = std::move(done);
+    }
+    item->recv_args = recv_args;
+    item->cancellation_token = token;
+    item->request_start_micros_ = request_start_micros;
+    item->recv_start_micros_ = Env::Default()->NowMicros();
+    if (item->recv_args.device_context) {
+      item->recv_args.device_context->Ref();
+    }
+    queue_item->push_back(item);
+    queue_pair->queue_lock_.unlock();
+    return;
+  }
+  RDMA_LOG(1) << "LoadAsync Consume Send Item (key:" << key << "). ";
+  Item* item = queue_item->front();
+  // LOG(INFO) << "QueueRdmaSendWaitRecv_Micros key:" << key << " micros:"
+  //           << Env::Default()->NowMicros() - item->send_start_micros_;
+  if (queue_item->size() == 1) {
+    VLOG(2) << "Clean up Send/Recv queue (key:" << key << "). ";
+    // queue_table_.erase(key_hash);
+  }
+  queue_item->pop_front();
+  queue_pair->queue_lock_.unlock();
+  DCHECK(item->HasValue());
+  done(Status::OK(), item->send_args, recv_args, *(item->value),
+       item->is_dead);
+  delete item;
+}
+
+void LocalDriverBufferMgr::LoadAsync(const string& key, const Args& recv_args,
+                                     DoneCallback done) {
+  LOG(FATAL) << "LoadAsync is not implemented; use QueueLoadAsync.";
+  return;
+}
+
+size_t LocalDriverBufferMgr::InitLocalDriverBufferMgr() {
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr begin...";
+  const auto& tensors_meta_data =
+      channel_->channel_record_->GetChannelTensorsMetaData();
+  const auto& tensors_uid_parsed_key =
+      channel_->channel_record_->GetChannelTensorsUidParsedkey();
+
+  CHECK(tensors_meta_data.size() == tensors_uid_parsed_key.size())
+      << "tensors_meta_data size:"
+      << tensors_meta_data.size()
+      << " tensors_uid_parsed_key size:"
+      << tensors_uid_parsed_key.size();
+
+  std::vector<string> print_keys;
+  for (auto& it : tensors_meta_data) {
+    auto tfi = table_.find(it.first);
+    if (tfi == table_.end()) {
+      table_[it.first] = new Item();
+    }
+    auto qfi = queue_table_.find(it.first);
+    if (qfi == queue_table_.end()) {
+      print_keys.emplace_back(it.first);
+      queue_table_[it.first] = new QueueItems();
+      queue_table_[it.first]->queue = new ItemQueue();
+    }
+  }
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr queue_table_:"
+              << print_keys.size()
+              << " "
+              << absl::StrJoin(print_keys, ",");
+  size_t ready_size = queue_table_.size();
+  RDMA_LOG(1) << "InitLocalDriverBufferMgr end size:" << ready_size;
+  return ready_size;
+}
+
+void LocalDriverBufferMgr::StartAbort(const Status& status) {
+  CHECK(!status.ok());
+  Table table;
+  {
+    status_.Update(status);
+    table_.swap(table);
+  }
+  for (auto& p : table) {
+    Item* item = p.second;
+    // only pending receives carry a callback to notify
+    if (item->HasCallback()) {
+      item->waiter(status, Args(), Args(), Tensor(), false);
+    }
+  }
+}
+
+//*****************************************************************************
+// RdmaTensorRequest
+//*****************************************************************************
+
 RdmaTensorRequest::RdmaTensorRequest(
     uint32_t index, const string& key, int64 step_id, RdmaChannel* channel,
     Device* dst_dev, const Rendezvous::Args recv_args,
     const RdmaTensorRequest::RecvDoneCallback& done)
     : index_(index),
-      key_(key),
       step_id_(step_id),
       channel_(channel),
      dst_dev_(dst_dev),
       recv_args_(recv_args),
-      meta_data_(RdmaMemoryMgr::Singleton().GetTensorMetaData(key)),
       result_tensor_(nullptr),
       proxy_tensor_(nullptr),
       rdma_addr_(nullptr),
       mr_(nullptr),
-      done_(done) {}
+      done_(done),
+      begin_start_req_(0) {
+  key_.assign(key, 0, RdmaMessage::kNameCapacity);
+}
 
-RdmaTensorRequest::~RdmaTensorRequest() { DeallocateTensors(); }
+RdmaTensorRequest::~RdmaTensorRequest() {
+  DeallocateTensors();
+}
 
 void RdmaTensorRequest::Done(const Status& s) {
   Tensor val = std::move(*result_tensor_);
-
-#ifdef RDMA_DATA_VALIDATION
-  // Validate checksum
-  // Unfortunately we can't always do a Checksum directly on the result tensor.
-  // If the result tensor is on GPU, then we need to copy it back to CPU. If
-  // we happen to be in the midst of a proxy callback, then the copying will
-  // get stuck.
-  uint64_t checksum = (proxy_tensor_ != nullptr)
-                          ? Checksum(nullptr, nullptr, *proxy_tensor_)
-                          : Checksum(dst_dev_, recv_args_.device_context, val);
-  ValidateChecksum(checksum_, checksum, val, index_, key_, "RDMA");
-#endif
-
   Rendezvous::Args recv_args = std::move(recv_args_);
   bool is_dead = (meta_data_ == nullptr) ? false : meta_data_->is_dead_;
   RecvDoneCallback done = done_;
   DeallocateTensors();
+  // if (result_region_.get() != nullptr) {
+  //   result_region_->Unref();
+  // }
   channel_->RemoveTensorRequest(index_);
   done(s, Rendezvous::Args(), recv_args, val, is_dead);
 }
 
 void RdmaTensorRequest::DeallocateTensors() {
+  // if (fake_allocator_ != nullptr) {
+  //   LOG(INFO) << "delete fake_allocator";
+  //   delete fake_allocator_;
+  //   fake_allocator_ = nullptr;
+  // }
   if (result_tensor_ != nullptr) {
     delete result_tensor_;
     result_tensor_ = nullptr;
@@ -1524,37 +3119,173 @@ void RdmaTensorRequest::DeallocateTensors() {
   }
 }
 
+size_t RdmaChannel::Alloc(size_t size, void** p, ibv_mr** mr,
+                          bool dynamic, size_t realloc_size) const {
+  size_t allocate_size = size;
+  if (dynamic) {
+    ib_malloc(p, &allocate_size, size, EIGEN_MAX_ALIGN_BYTES);
+    *mr = ibv_reg_mr(pd_, *p, allocate_size,
+                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+    return allocate_size;
+  }
+  // chunk alloc
+  adapter_->recv_chunk_->Alloc(ib_allocate_size(size), p, mr, realloc_size);
+  return allocate_size;
+}
+
+bool RdmaChannel::FindLocalMr(const std::string& key,
+    void** remote_bytes_addr, ibv_mr** mr, int* length) {
+  mutex_lock l(remote_bytes_addr_mu_);
+  auto it = remote_bytes_addr_mrs_.find(key);
+  if (it == remote_bytes_addr_mrs_.end()) {
+    return false;
+  }
+  *remote_bytes_addr = it->second->addr_;
+  *mr = it->second->mr_ptr_;
+  *length = it->second->size_;
+  CHECK(*remote_bytes_addr != nullptr && *mr != nullptr)
+      << "key " << key << " *remote_bytes_addr is null?";
+  return *remote_bytes_addr != nullptr && *mr != nullptr;
+}
+
+void RdmaChannel::FindOrCreateRemoteBytesAddrMemoryRegion(
+    const std::string& key,
+    void** remote_bytes_addr,
+    ibv_mr** mr,
+    std::shared_ptr * region,
+    size_t length,
+    const Allocator* alloc_attr) {
+  int allocate_size = 0;
+  // the region may already hold this addr's info.
+  if ((*region).get() != nullptr && (*region)->size_ > length) {
+    *remote_bytes_addr = (*region)->addr_;
+    *mr = (*region)->mr_ptr_;
+    // (*region)->Ref();
+    return;
+  }
+  // allocate_size = VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+  // allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, true);
+  // *region = std::make_shared(
+  //     *remote_bytes_addr, *mr, allocate_size);
+  // return;
+
+  // TODO(wuyongyu02): a keyed cache is used
+  // (see https://km.sankuai.com/page/641262306); sparse tensors vary in
+  // size, so a large buffer is kept per key.
+  if (!could_send_driver_) {
+    remote_bytes_addr_mu_.lock();
+  }
+  auto it = remote_bytes_addr_mrs_.find(key);
+  if (it == remote_bytes_addr_mrs_.end()) {
+    allocate_size =
+        VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+    // add room for the concatenated DriverPrefixMessage
+    allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+    allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false);
+    *region = std::make_shared(
+        *remote_bytes_addr, *mr, allocate_size);
+    remote_bytes_addr_mrs_[key] = *region;
+    // LOG(INFO) << "#1 key:" << key << " size:" << length;
+    if (!could_send_driver_) {
+      remote_bytes_addr_mu_.unlock();
+    }
+  } else {
+    if (length > it->second->size_) {
+      allocate_size =
+          VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+      // add room for the concatenated DriverPrefixMessage
+      allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+      allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false,
+                            it->second->size_);
+      *region = std::make_shared(
+          *remote_bytes_addr, *mr, allocate_size);
+      it->second = *region;
+      it->second->size_ = allocate_size;
+      // LOG(INFO) << "#2 create new tensor:" << key;
+    }
+    // else if (it->second->RefCountIsOne()) {
+    //   allocate_size =
+    //       VerbsEnvRegistrar::Instance()->RdmaTensorBufferRatio() * length;
+    //   allocate_size += DriverPrefixMessage::kPrefixMessageTotalBytes;
+    //   allocate_size = Alloc(allocate_size, remote_bytes_addr, mr, false);
+    //   *region = std::make_shared(
+    //       *remote_bytes_addr, *mr, allocate_size);
+    // }
    else {
+      *region = it->second;
+      *remote_bytes_addr = it->second->addr_;
+      *mr = it->second->mr_ptr_;
+      // LOG(INFO) << "#3 key:" << key << " size:" << length;
+    }
+    // (*region)->Ref();
+    if (!could_send_driver_) {
+      remote_bytes_addr_mu_.unlock();
+    }
+  }
+}
+
+size_t RdmaChannel::ChannelAllocateTensors(
+    const string& key,
+    const TensorMetaData& meta,
+    const Allocator* alloc_attr,
+    ibv_mr** mr /*new*/,
+    std::shared_ptr * region,
+    void** rdma_addr /*new*/) {
+  size_t max_length = 0;
+  if (DataTypeCanUseMemcpy(meta.data_type_)) {
+    max_length = RecordTensorMetaData::GetTensorLength(meta.data_type_,
+                                                       meta.tensor_shape_);
+  } else {
+    max_length = meta.proto_size_;
+  }
+  // use the keyed allocator for the RdmaTensorRequest
+  FindOrCreateRemoteBytesAddrMemoryRegion(key, rdma_addr, mr, region,
+                                          max_length, alloc_attr);
+  return max_length;
+}
+
+size_t RdmaTensorRequest::GetTensorLength(const TensorMetaData& meta) {
+  size_t max_length = 0;
+  if (DataTypeCanUseMemcpy(meta.data_type_)) {
+    max_length = RecordTensorMetaData::GetTensorLength(meta.data_type_,
+                                                       meta.tensor_shape_);
+  } else {
+    max_length = meta.proto_size_;
+  }
+  return max_length;
+}
+
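// GetTensorLength()/ChannelAllocateTensors() above size the receive buffer
// before the tensor arrives: memcpy-able dtypes use element_count *
// sizeof(dtype) (e.g. a DT_FLOAT tensor of shape [128, 64] needs
// 128 * 64 * 4 = 32768 bytes), while non-memcpy dtypes such as DT_STRING
// fall back to the recorded proto_size_. The buffer is then over-allocated
// by RdmaTensorBufferRatio() plus the DriverPrefixMessage header, per
// FindOrCreateRemoteBytesAddrMemoryRegion() above.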
if (DataTypeCanUseMemcpy(meta_data_->data_type_)) { + fake_allocator_ = new FakeAllocator(rdma_addr_); + result_tensor_ = new Tensor(fake_allocator_, + meta_data_->data_type_, + meta_data_->tensor_shape_); + } else { + // proto + result_tensor_ = new Tensor(dst_dev_->GetAllocator(recv_args_.alloc_attrs), meta_data_->data_type_, meta_data_->tensor_shape_); - + } size_t tensor_size = result_tensor_->TotalBytes(); bool can_memcpy = DataTypeCanUseMemcpy(result_tensor_->dtype()); - if (can_memcpy) { - if (tensor_size == 0) { - return true; - } - rdma_addr_ = DMAHelper::base(result_tensor_); - mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); + if (can_memcpy && tensor_size == 0) { + return true; + } #if GOOGLE_CUDA - if (mr_ == nullptr) { + if (can_memcpy) { // Can't RDMA directly to result. Use a proxy. proxy_tensor_ = new Tensor(GPUProcessState::singleton()->GetGpuHostAllocator(0), result_tensor_->dtype(), result_tensor_->shape()); rdma_addr_ = DMAHelper::base(proxy_tensor_); - mr_ = - RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); + // mr_ = + // RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size); } #endif - } else { - uint32_t proto_size = meta_data_->proto_size_; - rdma_addr_ = malloc(proto_size); - mr_ = ibv_reg_mr(RdmaMemoryMgr::Singleton().pd_, rdma_addr_, proto_size, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - } - CHECK(mr_ != nullptr) << " No memory region found for address " << rdma_addr_ + CHECK(mr_ != nullptr) << " No memory region found for address " + << rdma_addr_ << ": " << key_; return true; } @@ -1574,7 +3305,9 @@ void RdmaTensorRequest::AllocateTensorsAsync(StatusCallback done) { } void RdmaTensorRequest::Send(RdmaMessageType message_type) { - RdmaMessageBuffer* rb = channel_->tx_message_buffer_; + int pair_index = (index_ % RdmaChannel::kNumMessageBuffers) / 2; + int buffer_index = 2 * pair_index; + auto* rb = channel_->message_buffers()[buffer_index]; RdmaMessage rm; rm.type_ = message_type; rm.request_index_ = index_; @@ -1591,7 +3324,7 @@ void RdmaTensorRequest::Send(RdmaMessageType message_type) { rm.data_type_ = DT_INVALID; } rm.rkey_ = (mr_ == nullptr) ? 0 : mr_->rkey; - + // rm.create_micros_ = 0; RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec << ": Sending " << MessageTypeToString(message_type) << " #" << index_ << ": " << rm.name_ << " on " << rdma_addr_ @@ -1604,50 +3337,44 @@ void RdmaTensorRequest::Send(RdmaMessageType message_type) { void RdmaTensorRequest::RecvTensorMetaData(DataType dtype, TensorShape shape, bool is_dead, size_t proto_size) { - meta_data_ = RdmaMemoryMgr::Singleton().SetTensorMetaData( + meta_data_ = channel_->SetTensorMetaData( key_, dtype, shape, is_dead, proto_size); - + // channel record MetaData + channel_->channel_record_->Record(key_, *meta_data_); + // global record + // RecordTensorMetaData::Singleton().GlobalRecord(key_, *meta_data_); DeallocateTensors(); + // if (result_region_.get() != nullptr) { + // result_region_->Unref(); + // } AllocateTensorsAsync( [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_RE_REQUEST); }); } void RdmaTensorRequest::RecvTensorContent() { + uint64_t deal_data_begin = Env::Default()->NowMicros(); bool can_memcpy = DataTypeCanUseMemcpy(meta_data_->data_type_); size_t message_size = can_memcpy ? 
result_tensor_->TotalBytes() : meta_data_->proto_size_; + RDMA_LOG(1) << "Step 0x" << std::hex << step_id_ << std::dec << ": Received tensor content #" << index_ << ": " << key_ << " (Size: 0x" << std::hex << message_size << ")"; - Tensor val; - -#if GOOGLE_CUDA - if (proxy_tensor_ != nullptr) { - CountCopies(key_, (void*)DMAHelper::base(proxy_tensor_), - (void*)DMAHelper::base(result_tensor_), - result_tensor_->TotalBytes(), false); - GPUUtil::CopyCPUTensorToGPU(proxy_tensor_, recv_args_.device_context, - dst_dev_, result_tensor_, - [this](const Status& s) { - CHECK(s.ok()) << "copy tensor to gpu sync"; - Done(s); - }, - true /*sync_dst_compute*/); - return; - } -#endif - if (can_memcpy) { + // copy Tensor from rdma_addr_ + // TODO(wuyongyu) + // only the rdma_addr_ has value , can memcpy + // if (result_tensor_->TotalBytes() > 0) { + // memcpy(DMAHelper::base(result_tensor_), (void*)(rdma_addr_), + // result_tensor_->TotalBytes()); + // } + // Recv Tensor memory if can resuse Done(Status::OK()); } else { - RDMA_LOG(2) << "Decoding proto: " << key_ - << " (Size: " << meta_data_->proto_size_ << ")"; TensorProto proto; CHECK(ParseProtoUnlimited(&proto, rdma_addr_, meta_data_->proto_size_)) << "fail to parse proto from array"; - ibv_dereg_mr(mr_); - free(rdma_addr_); Status s = dst_dev_->MakeTensorFromProto(proto, recv_args_.alloc_attrs, result_tensor_); Done(s); @@ -1663,7 +3390,7 @@ void RdmaTensorRequest::RecvErrorStatus(const Status& status) { } void RdmaTensorRequest::Start() { - meta_data_ = RdmaMemoryMgr::Singleton().GetTensorMetaData(key_); + meta_data_ = channel_->GetTensorMetaData(key_); if (meta_data_ != nullptr) { AllocateTensorsAsync( [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_REQUEST); }); @@ -1671,5 +3398,6 @@ void RdmaTensorRequest::Start() { Send(RDMA_MESSAGE_TENSOR_REQUEST); } } - } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma.h b/tensorflow_networking/verbs/rdma.h index bd9460f..4582980 100644 --- a/tensorflow_networking/verbs/rdma.h +++ b/tensorflow_networking/verbs/rdma.h @@ -16,8 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_H_ #define TENSORFLOW_CONTRIB_VERBS_RDMA_H_ -#include +#ifdef TENSORFLOW_USE_VERBS +#include #include // for memset #include #include // for shared_ptr @@ -25,7 +26,11 @@ limitations under the License. #include #include #include +#include +#include +#include +#include "tensorflow_networking/verbs/verbs_util.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor.h" @@ -33,7 +38,12 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" -#include "tensorflow_networking/verbs/verbs_util.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow_networking/verbs/verbs_service.pb.h" +#include "tensorflow_networking/verbs/grpc_verbs_client.h" +#include "absl/container/flat_hash_map.h" +#include "tensorflow_networking/verbs/rdma_mgr.h" + namespace tensorflow { #define PKEY_DEFAULT 0 @@ -80,6 +90,7 @@ enum RdmaMessageType { RDMA_MESSAGE_TENSOR_RE_REQUEST, RDMA_MESSAGE_TENSOR_REQUEST, RDMA_MESSAGE_ERROR_STATUS, + RDMA_MESSAGE_DRIVER_BEGIN }; struct RdmaMessage { @@ -87,7 +98,7 @@ struct RdmaMessage { uint16_t name_size_; string name_; int64 step_id_; - uint64_t request_index_; + uint32_t request_index_; union { uint64_t remote_addr_; #ifdef RDMA_DATA_VALIDATION @@ -100,13 +111,20 @@ struct RdmaMessage { TensorShape tensor_shape_; size_t tensor_bytes_; + // int64 create_micros_; + + // uint32_t remote_bytes_addr_key_; + // uint64_t remote_bytes_addr_; // For error status: Status status_; + // (wuyongyu02) add the 'create_micros' for cat log // type|name_size|name|step_id|request_index|remote_addr/checksum|rkey|... // 1B| 2B | 512| 8B | 8B | 8B | 4B |... - // ...|is_dead|data_type|tensor_shape|tensor_bytes|error_status | - // ...| 1B | XB | XB | 8B |size - 4B, proto - XB | + // ...|is_dead|data_type|tensor_shape|tensor_bytes|create_micros |... + // ...| 1B | XB | XB | 8B | 8B |... + // ...|remote_bytes_addr| error_status | + // ...|8B | size - 4B, proto - XB | static const size_t kNameCapacity = 512; static const size_t kTypeStartIndex = 0; static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_); @@ -127,8 +145,14 @@ struct RdmaMessage { kDataTypeStartIndex + sizeof(data_type_); static const size_t kTensorBytesStartIndex = kTensorShapeStartIndex + sizeof(TensorShape); + // static const size_t kCreateMicrosStartIndex = + // kTensorBytesStartIndex + sizeof(tensor_bytes_); + + // static const size_t kErrorStatusStartIndex = + // kCreateMicrosStartIndex + sizeof(create_micros_); static const size_t kErrorStatusStartIndex = kTensorBytesStartIndex + sizeof(tensor_bytes_); + static const size_t kErrorStatusMaxSize = 4096; static const size_t kMessageTotalBytes = kErrorStatusStartIndex; @@ -138,18 +162,70 @@ struct RdmaMessage { static void ParseMessage(RdmaMessage& rm, void* buffer); }; +// Parse a RdmaMessage according to the pre-defined format +// Args: +// rm: the message structure where the parsed message will be saved +// buffer: the place where the raw message is stored +// Returns: +// None +struct FussionMessages { + /* data */ + static const size_t kRdmaMaxMessagesNumber = 50; + uint32_t message_numbers; + uint32_t message_size[kRdmaMaxMessagesNumber]; + std::string messages[kRdmaMaxMessagesNumber]; + /* func */ + static string CreateFusionMessages(const std::vector& rmv); + static void ParseFussionMessages(std::vector& rmv, void* buffer); + + /* index */ + static const size_t kMessageNumbersStartIndex = 0; + static const size_t kMessageSizeStartIndex = kMessageNumbersStartIndex + + sizeof(message_numbers); + static const size_t KStringMessagesStartIndex = kMessageSizeStartIndex + + sizeof(message_size); + static const size_t kTotalFussionMessageSize = KStringMessagesStartIndex + + kRdmaMaxMessagesNumber * RdmaMessage::kRdmaMessageBufferSize; +}; + +class FakeAllocator : public Allocator { + public: + FakeAllocator(void* buffer) : buffer_(buffer) {} 
+  string Name() override { return "fake_allocator"; }
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    // Simply return the pre-allocated buffer.
+    return buffer_;
+  }
+  void DeallocateRaw(void* ptr) override {
+    // TODO(wyy): will the real owner free buffer_?
+    // free(buffer_);
+    // port::AlignedFree(buffer_);
+  }
+
+ private:
+  // The buffer should be 64-byte aligned.
+  void* buffer_ = nullptr;
+};
+
+class RdmaChannel;
+class ChannelRecordTensorMetaData;
+class RdmaSendDriverMgr;
+
 // Immediate types for RDMA write
+const int Const_kNumMessageBuffers = 80;  // originally 80
 enum RdmaImmDataType {
-  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFD,
-  RDMA_IMM_DATA_ACK = 0xFFFFFFFE,
-  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF
+  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFF - 2 * Const_kNumMessageBuffers - 2,
+  RDMA_IMM_DATA_ACK = 0xFFFFFFFF - Const_kNumMessageBuffers - 1,
+  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF,
+  RDMA_IMM_MIN_SENDMGR_BASE = int(RDMA_IMM_MAX_REQUEST_ID / 2 + 1),
 };
 
 // Write types for RDMA write-complete events
 enum RdmaWriteIDType {
   RDMA_WRITE_ID_ACK,
   RDMA_WRITE_ID_MESSAGE,
-  RDMA_WRITE_ID_TENSOR_WRITE
+  RDMA_WRITE_ID_TENSOR_WRITE,
+  RDMA_WRITE_ID_SEND_DEIVER_WRITE
 };
 
 // Context for RDMA write-complete events
@@ -169,6 +245,9 @@ class TensorMetaData {
   DataType data_type_;
   size_t proto_size_;
   bool is_dead_;
+  uint32 uid_;
+  // Records whether the meta data changed, for the send-driven path.
+  bool meta_changed_ = false;
 
   std::ostream& print(std::ostream& out) const {
     out << "Dtype = " << DataTypeString(data_type_)
@@ -183,8 +262,6 @@ inline std::ostream& operator<<(std::ostream& out,
   return meta_data.print(out);
 }
 
-class RdmaChannel;
-
 void MRDeleter(ibv_mr* mr);
 using MemoryRegionPtr = std::unique_ptr<ibv_mr, decltype(&MRDeleter)>;
@@ -192,42 +269,175 @@
 // Manages the local meta-data cache, and the registered RDMA memory regions.
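// --- Illustrative aside (not part of the patch): what FakeAllocator is for.
// The NIC has already written the tensor bytes into an RDMA-registered
// buffer, so the receive path wraps that memory in a Tensor instead of
// copying. WrapRdmaBuffer is a hypothetical helper; Tensor(Allocator*,
// DataType, TensorShape) is the stock TensorFlow constructor.
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {
inline Tensor WrapRdmaBuffer(void* rdma_addr, DataType dtype,
                             const TensorShape& shape) {
  // Leaked on purpose here, mirroring fake_allocator_ in rdma.cc. The Tensor
  // never frees the buffer (DeallocateRaw above is a no-op), so the
  // registered region must outlive the Tensor.
  FakeAllocator* fake_allocator = new FakeAllocator(rdma_addr);
  return Tensor(fake_allocator, dtype, shape);
}
}  // namespace tensorflow
// --- end aside ---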
 class RdmaMemoryMgr {
  public:
-  static RdmaMemoryMgr& Singleton() {
-    static RdmaMemoryMgr instance;
-    return instance;
-  }
+  explicit RdmaMemoryMgr(struct ibv_pd* pd) : pd_(pd) {}
+  // static RdmaMemoryMgr& Singleton() {
+  //   static RdmaMemoryMgr instance;
+  //   return instance;
+  // }
 
-  // Memory regions
   ibv_mr* FindMemoryRegion(void* addr, size_t length);
+
   void InsertMemoryRegion(void* addr, size_t length,
                           const std::string& allocator_name);
   void EvictMemoryRegion(void* addr, size_t length);
 
-  // Tensor meta-data cache
-  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
-  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
-                                          DataType dtype,
-                                          const TensorShape& shape,
-                                          bool is_dead, size_t proto_size);
+  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
+    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
+  }
 
   struct ibv_pd* pd_;
 
- protected:
-  RdmaMemoryMgr() : pd_(nullptr) {}
+ private:
+  // Managed memory regions
+  mutex mrs_mu_;
+  std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(mrs_mu_);
+};
 
-  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
-    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
-  }
+class RecordTensorMetaData {
+ public:
+  RecordTensorMetaData() {
+    // stop_.store(true, std::memory_order_relaxed);
+    total_bytes_ = 0;
+  }
+
+  ~RecordTensorMetaData() {
+    // stop_.store(false, std::memory_order_relaxed);
+  }
+
+  static RecordTensorMetaData& Singleton() {
+    static RecordTensorMetaData instance;
+    return instance;
+  }
+
+  static uint32 GetTensorLength(const DataType& data_type,
+                                const TensorShape& tensor_shape) {
+    return GetEnumSize(data_type) * tensor_shape.num_elements();
+  }
+
+  static uint32 GetEnumSize(const DataType& data_type);
+
+  void GlobalRecord(const std::string& origin_tensor_name,
+                    const TensorMetaData& m, bool stop_record = false);
+
+  typedef std::unordered_map<std::string, TensorMetaData> GTensorMetaType;
+  typedef std::unordered_map<uint32, std::string> GTensorsUidKeyType;
+
+  const GTensorMetaType& GetGlobalTensorsMetaData() {
+    return global_tensors_meta_data_;
+  }
+
+  const GTensorsUidKeyType& GetGlobalTensorsUidParsedkey() {
+    return global_tensors_uid_parsed_key_;
+  }
+
+  string DebugString() const;
+
+  void WriteOutput(const std::string& content) const;
+
+  void ReadFile(const std::string& filename, StringPiece* content);
+
  private:
-  mutex tensor_meta_data_mu_;
-  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+  mutex global_tensor_meta_data_mu_;
+  GTensorMetaType global_tensors_meta_data_;
+  GTensorsUidKeyType global_tensors_uid_parsed_key_;
+  // uid_ must stay below RDMA_IMM_MAX_REQUEST_ID
+  uint32 uid_ = RDMA_IMM_MIN_SENDMGR_BASE;
+  // std::atomic<bool> stop_;
+  uint64 total_bytes_;
+  string local_worker_name_ = "";
+};
 
-  // Managed memory regions
-  mutex mrs_mu_;
-  std::vector<MemoryRegionPtr> mrs_ TF_GUARDED_BY(mrs_mu_);
+// Owned by RdmaChannel: buffers locally produced tensors and waiting
+// consumers for the send-driven transfer path.
+class LocalDriverBufferMgr {
+ public:
+  explicit LocalDriverBufferMgr(RdmaChannel* channel) : channel_(channel) {
+    DCHECK(channel != nullptr)
+        << "LocalDriverBufferMgr construct channel is nullptr.";
+  }
+
+  typedef Rendezvous::DoneCallback DoneCallback;
+  typedef Rendezvous::Args Args;
+  typedef Rendezvous::ParsedKey ParsedKey;
+  struct Item {
+    mutex item_lock_;
+    DoneCallback waiter = nullptr;
+    Tensor* value;
+    bool is_dead = false;
+    bool has_value = false;
+    Args send_args;
+    Args recv_args;
+    CancellationToken cancellation_token;
+    uint64 send_start_micros_;
+    uint64 recv_start_micros_;
+    uint64 request_start_micros_;
+
+    ~Item() {
+      if (send_args.device_context) {
+        send_args.device_context->Unref();
+      }
+      if (recv_args.device_context) {
+        recv_args.device_context->Unref();
+      }
+      if (value != nullptr) {
+        // delete value;
+      }
+    }
+
+    // Returns true iff this item represents a value being sent.
+    bool HasCallback() const { return this->waiter != nullptr; }
+
+    bool HasValue() const { return this->has_value; }
+  };
+
+  typedef std::deque<Item*> ItemQueue;
+
+  struct QueueItems {
+    ItemQueue* queue;
+    mutex queue_lock_;
+  };
+
+  typedef gtl::FlatMap<string, Item*> Table;
+
+  typedef gtl::FlatMap<string, QueueItems*> QueueTable;
+
+  size_t InitLocalDriverBufferMgr();
+
+  Status RdmaSave(const string& key, const Args& send_args, const Tensor& val,
+                  const bool is_dead);
+
+  Status QueueRdmaSave(const string& key, const Args& send_args,
+                       Tensor* val, const bool is_dead,
+                       const uint64& send_begin_micros);
+
+  void LoadAsync(const string& key, const Args& recv_args,
+                 DoneCallback done);
+
+  void QueueLoadAsync(const string& key, const Args& recv_args,
+                      DoneCallback done, const uint64& request_start_micros);
+
+  void StartAbort(const Status& status);
+
+  ~LocalDriverBufferMgr() {
+    if (!table_.empty()) {
+      StartAbort(errors::Cancelled("LocalDriverBufferMgr deleted"));
+    }
+  }
+
+ public:
+  bool use_queue_item_ = true;
+
+ private:
+  RdmaChannel* channel_;  // not owned
+  Table table_;  // GUARDED_BY(mu_);
+  QueueTable queue_table_;
+  Status status_ = Status::OK();  // GUARDED_BY(mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(LocalDriverBufferMgr);
+};
+
+class RemoteBytesAddrMemoryRegion;
+
 // RdmaTensorRequest
 // Represents a single tensor request.
 class RdmaTensorRequest {
@@ -269,6 +479,10 @@
   // Invoke Done() with the status code.
   void RecvErrorStatus(const Status& status);
 
+  RdmaChannel* rdma_channel() {
+    return channel_;
+  }
+
 #ifdef RDMA_DATA_VALIDATION
   // Receive tensor checksum
   //
@@ -277,6 +491,12 @@
   // checksum right before invoking Done().
   void RecvTensorChecksum(uint64_t checksum) { checksum_ = checksum; }
 #endif
+  uint64_t begin_start_req_;
+  string key_;
+  // SendMetaData message micros
+  // uint64_t rm_create_micros_;
+  RecvDoneCallback done_;
+  Rendezvous::Args recv_args_;
 
  private:
   void Done(const Status& s);
@@ -285,30 +505,56 @@
   void AllocateTensorsAsync(StatusCallback done);
   void DeallocateTensors();
 
+  size_t GetTensorLength(const TensorMetaData& meta);
+
   uint32_t index_;
-  string key_;
   int64 step_id_;
   RdmaChannel* channel_;
   Device* dst_dev_;
-  Rendezvous::Args recv_args_;
+
   const TensorMetaData* meta_data_;
+  FakeAllocator* fake_allocator_ = nullptr;
   Tensor* result_tensor_;
+
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> result_region_;
   Tensor* proxy_tensor_;
   void* rdma_addr_;
+  // void* rdma_remote_bytes_addr_ = nullptr;
+  // ibv_mr* remote_bytes_addr_mr_ = nullptr;
   ibv_mr* mr_;
-  RecvDoneCallback done_;
#ifdef RDMA_DATA_VALIDATION
   uint64_t checksum_;
 #endif
 };
 
+struct DriverEntry;
+
 // RdmaTensorResponse
 // Represents a single tensor response.
 class RdmaTensorResponse {
  public:
   // Creates a response for request message.
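// --- Illustrative aside (not part of the patch): the save/load rendezvous
// pattern LocalDriverBufferMgr declares, reduced to a standalone sketch.
// MiniRendezvous and its members are stand-ins; the real class stores
// Tensors, Args and cancellation state per key.
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>

class MiniRendezvous {
 public:
  using Done = std::function<void(const std::string& value)>;

  // Producer side: publish a value, waking a parked consumer if present.
  void Save(const std::string& key, const std::string& value) {
    Done waiter;
    {
      std::lock_guard<std::mutex> l(mu_);
      Item& it = items_[key];
      if (it.waiter) {
        waiter = std::move(it.waiter);  // the consumer arrived first
      } else {
        it.value = value;
        it.has_value = true;
        return;
      }
    }
    waiter(value);  // run the callback outside the lock
  }

  // Consumer side: fire immediately if the value is there, else park.
  void LoadAsync(const std::string& key, Done done) {
    std::string value;
    {
      std::lock_guard<std::mutex> l(mu_);
      Item& it = items_[key];
      if (!it.has_value) {
        it.waiter = std::move(done);  // the producer is not here yet
        return;
      }
      value = it.value;
    }
    done(value);
  }

 private:
  struct Item {
    std::string value;
    bool has_value = false;
    Done waiter;
  };
  std::mutex mu_;
  std::unordered_map<std::string, Item> items_;
};
// --- end aside ---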
   RdmaTensorResponse(RdmaChannel* channel, const RdmaMessage& rm)
-      : channel_(channel), rm_(rm) {}
+      : channel_(channel), rm_(rm) {
+    // The request name is a full rendezvous key:
+    //   strings::StrCat(
+    //       src_device, ";", strings::Uint64ToHexString(src_incarnation, buf),
+    //       ";", dst_device, ";", name, ";", frame_iter.frame_id, ":",
+    //       frame_iter.iter_id);
+    // Extract the destination device (the third component) from it.
+    if (!rm.name_.empty()) {
+      size_t found = rm.name_.find(";");
+      string str = rm.name_.substr(found + 1, rm.name_.size());
+
+      found = str.find(";");
+      str = str.substr(found + 1, str.size());
+
+      found = str.find(";");
+      req_to_device_ = str.substr(0, found);
+      parsed_key_ = rm.name_;
+    } else {
+      req_to_device_ = "";
+      parsed_key_ = "";
+    }
+  }
 
   void Update(const RdmaMessage& rm) { rm_ = rm; }
@@ -333,20 +579,36 @@
   // Destroy the response's resources and remove it from the pending list.
   void Destroy();
 
+ public:
+  uint64 request_index_;
+  uint64 recv_local_send_rdma_;
+  uint64 recv_send_content_ = 0;
+  uint64 send_meta_begin_;
+  string parsed_key_;
+  string req_to_device_;
+
  private:
-  void RecvHandler(Rendezvous::ParsedKey parsed,
-                   const Rendezvous::Args& send_args,
+  void RecvHandler(const Rendezvous::Args& send_args,
                    const Rendezvous::Args& recv_args, const Tensor& in,
                    bool is_dead);
+
   void Clone(const Tensor& in, const TensorProto& proto, bool is_dead);
+
+  void RdmaClone(const Tensor& in, const TensorProto& proto, bool is_dead);
+
   void Send(const Tensor& in, const TensorProto& proto, bool is_dead,
             const Status& status);
+  void SendBck(const Tensor& in, const TensorProto& proto, bool is_dead,
+               const Status& status);
+
   bool TensorMetaDataChanged(const Tensor& in, bool is_dead);
   Status PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
                            Device** src_dev);
   void SendMetaData(const Tensor& in, const TensorProto& proto, bool is_dead);
-  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead);
-  void SendErrorStatus(const Status& status);
+  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead,
+                   bool is_resume);
+  void SendErrorStatus(const Status& status, const std::string& src_func_name);
 
   RdmaChannel* channel_;
   RdmaMessage rm_;  // The request message
@@ -361,6 +623,35 @@
   TensorProto* proto_ = nullptr;
   Tensor* tensor_ = nullptr;
   bool is_dead_ = false;
+
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> res_region_;
+  FakeAllocator* res_fake_allocator_;
+};
+
+// Grows large registered chunks and hands out sub-allocations from them.
+class Chunk {
+ public:
+  explicit Chunk(struct ibv_pd* pd);
+
+  void FreeChunk();
+
+  ~Chunk();
+
+  void Alloc(size_t size, void** p, ibv_mr** mr, size_t realloc_size = 0);
+
+ private:
+  void* new_p_;
+  ibv_mr* new_mr_;
+  size_t chunk_addr_size = 64 * 1024 * 1024;
+  uint64 offset_;
+  uint64 curr_size_;
+  uint64 empty_size_;
+  uint64 total_waste_size_;
+  uint64 total_realloc_size_;
+  mutex alloc_mu_;
+  int allocate_size_;
+  struct ibv_pd* pd_;
+  std::vector<ibv_mr*> mrs_;
+  std::vector<void*> chunk_addrs_;
 };
 
 class RdmaMessageBuffer;
@@ -371,8 +662,11 @@ class RdmaAdapter {
   friend class RdmaChannel;
   friend class RdmaMessageBuffer;
   friend class RdmaTensorResponse;
+  friend class RdmaTensorRequest;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
+  friend class RdmaSendDriverMgr;
+  friend class ChannelRecordTensorMetaData;
 
  public:
   RdmaAdapter(const WorkerEnv* worker_env);
@@ -380,29 +674,298 @@
   // Adapter name, e.g. mlx5_0.
   string name() const;
   void StartPolling();
-  void Process_CQ();
+  void Pool_Process_CQ(int cq_num);
+  void Process_WR(ibv_wc wc_, int cq_num);
 
  protected:
-  static const int MAX_CONCURRENT_WRITES = 1000;
+  thread::ThreadPool* pool_;
+  static const int MAX_CONCURRENT_WRITES = 5000;  // originally 1000, then 5000
   ibv_context* context_;
   // RDMA configuration parameters
   RdmaParams params_;
   // ibverbs protection domain
   ibv_pd* pd_;
   // Completion event channel, to wait for work completions
-  ibv_comp_channel* event_channel_;
+  ibv_comp_channel** event_channel_vec_;
+
   // Completion queue, to poll on work completions
-  ibv_cq* cq_;
+  ibv_cq** cq_vec_;
+  // number of completion queues
+  int cq_nums_;
   // Pre-allocated work completions array used for polling
-  ibv_wc wc_[MAX_CONCURRENT_WRITES * 2];
+  ibv_wc** wc_vec_;
   // worker env for thread
   const WorkerEnv* worker_env_;
   // thread for cq.
-  std::unique_ptr<Thread> polling_thread_;
+  std::vector<std::unique_ptr<Thread>> polling_thread_vec_;
+  Chunk* recv_chunk_ = nullptr;
 };
 
 // Class that represents a connection to a remote Rdma peer.
 // Responsible for connecting queue pairs.
+class RemoteBytesAddrMemoryRegion {
+ public:
+  RemoteBytesAddrMemoryRegion(void* addr, ibv_mr* mr, size_t s) {
+    mr_ptr_ = mr;
+    addr_ = addr;
+    size_ = s;
+    ref_.store(0);
+  }
+
+  // TODO(wuyongyu02): mr_ptr_ still needs an ibv_dereg_mr somewhere.
+  ~RemoteBytesAddrMemoryRegion() {
+    if (mr_ptr_ != nullptr && addr_ != nullptr) {
+      // ibv_dereg_mr(mr_ptr_);
+      // free(addr_);
+      addr_ = nullptr;
+      mr_ptr_ = nullptr;
+    }
+  }
+
+  // NOTE: despite the name, this returns true whenever the region holds at
+  // least one reference.
+  bool RefCountIsOne() const {
+    return (ref_.load(std::memory_order_acquire) >= 1);
+  }
+
+  void Ref() const {
+    ref_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  // Drops all references unconditionally.
+  bool Unref() const {
+    ref_.store(0);
+    return true;
+  }
+
+  mutable std::atomic_int_fast32_t ref_;
+  void* addr_;
+  ibv_mr* mr_ptr_;
+  size_t size_;
+};
+
+// Prefix prepended to each send-driven tensor write: shape, byte count,
+// is_dead flag and a send timestamp.
+struct DriverPrefixMessage {
+  TensorShape tensor_shape_;
+  size_t tensor_bytes_;
+  bool is_dead_;
+  uint64 send_micros_;
+  // Short layout, used when the meta data has not changed:
+  static const size_t CKIsDeadIndexStartIndex = 0;
+  static const size_t CkSendMiscrosStartIndex =
+      CKIsDeadIndexStartIndex + sizeof(is_dead_);
+  static const size_t CkPrefixMessageTotalBytes =
+      CkSendMiscrosStartIndex + sizeof(send_micros_);
+
+  // Full layout, used when the meta data changed:
+  static const size_t kTensorShapeStartIndex = 0;
+  static const size_t kTensorBytesStartIndex =
+      kTensorShapeStartIndex + sizeof(tensor_shape_);
+  static const size_t KIsDeadIndexStartIndex =
+      kTensorBytesStartIndex + sizeof(tensor_bytes_);
+
+  static const size_t KSendMicrosStartIndex =
+      KIsDeadIndexStartIndex + sizeof(is_dead_);
+
+  static const size_t kPrefixMessageTotalBytes =
+      KSendMicrosStartIndex + sizeof(send_micros_);
+
+  static std::string CreateDriverPrefixMessage(const TensorShape& shape,
+      const size_t& tensor_bytes, const bool& is_dead,
+      const uint64& send_micros, const bool& meta_changed) {
+    if (meta_changed) {
+      char message[kPrefixMessageTotalBytes + 100];
+      memcpy(message + kTensorShapeStartIndex, &shape, sizeof(shape));
+      memcpy(message + kTensorBytesStartIndex, &tensor_bytes,
+             sizeof(tensor_bytes));
+      memcpy(message + KIsDeadIndexStartIndex, &is_dead, sizeof(is_dead));
+      memcpy(message + KSendMicrosStartIndex, &send_micros,
+             sizeof(send_micros));
+      return std::string(message, kPrefixMessageTotalBytes);
+    } else {
+      char message[CkPrefixMessageTotalBytes + 100];
+      memcpy(message + CKIsDeadIndexStartIndex, &is_dead, sizeof(is_dead));
+      memcpy(message + CkSendMiscrosStartIndex, &send_micros,
+             sizeof(send_micros));
+      return std::string(message, CkPrefixMessageTotalBytes);
+    }
+  }
+
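// --- Illustrative aside (not part of the patch): the fixed-offset prefix
// encoding above, as a standalone round-trip with POD fields only (the real
// struct memcpys a TensorShape object, which is riskier). Field names are
// illustrative.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

struct Prefix {
  uint64_t tensor_bytes;
  bool is_dead;
  uint64_t send_micros;

  static constexpr size_t kBytesOff = 0;
  static constexpr size_t kDeadOff = kBytesOff + sizeof(uint64_t);
  static constexpr size_t kMicrosOff = kDeadOff + sizeof(bool);
  static constexpr size_t kTotal = kMicrosOff + sizeof(uint64_t);

  std::string Encode() const {
    char buf[kTotal];
    std::memcpy(buf + kBytesOff, &tensor_bytes, sizeof(tensor_bytes));
    std::memcpy(buf + kDeadOff, &is_dead, sizeof(is_dead));
    std::memcpy(buf + kMicrosOff, &send_micros, sizeof(send_micros));
    return std::string(buf, kTotal);
  }

  static Prefix Decode(const void* addr) {
    const char* buf = static_cast<const char*>(addr);
    Prefix p;
    std::memcpy(&p.tensor_bytes, buf + kBytesOff, sizeof(p.tensor_bytes));
    std::memcpy(&p.is_dead, buf + kDeadOff, sizeof(p.is_dead));
    std::memcpy(&p.send_micros, buf + kMicrosOff, sizeof(p.send_micros));
    return p;
  }
};

int main() {
  Prefix in{1024, false, 1700000000ull};
  std::string wire = in.Encode();
  Prefix out = Prefix::Decode(wire.data());
  assert(out.tensor_bytes == in.tensor_bytes &&
         out.send_micros == in.send_micros);
  return 0;
}
// --- end aside ---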
+  static DriverPrefixMessage ParseDriverPrefixMessage(void* addr,
+      const bool& meta_changed) {
+    if (meta_changed) {
+      char* message = static_cast<char*>(addr);
+      DriverPrefixMessage m;
+      memcpy(&m.tensor_shape_, message + kTensorShapeStartIndex,
+             sizeof(m.tensor_shape_));
+      memcpy(&m.tensor_bytes_, message + kTensorBytesStartIndex,
+             sizeof(m.tensor_bytes_));
+      memcpy(&m.is_dead_, message + KIsDeadIndexStartIndex,
+             sizeof(m.is_dead_));
+      memcpy(&m.send_micros_, message + KSendMicrosStartIndex,
+             sizeof(m.send_micros_));
+      return m;
+    } else {
+      char* message = static_cast<char*>(addr);
+      DriverPrefixMessage m;
+      memcpy(&m.is_dead_, message + CKIsDeadIndexStartIndex,
+             sizeof(m.is_dead_));
+      memcpy(&m.send_micros_, message + CkSendMiscrosStartIndex,
+             sizeof(m.send_micros_));
+      return m;
+    }
+  }
+};
+
+enum DriverStatus {
+  DRIVER_INIT,
+  RPC_0,
+  RPC_1,
+  DATA_NOT_READY,
+  DATA_READY,
+  DRIVER_ERROR
+};
+
+struct DriverEntry {
+ public:
+  DriverEntry(const uint32& uid,
+              const std::string& parsedkey,
+              void* addr,
+              ibv_mr* mr,
+              int allocate_size);
+
+  DriverEntry();
+
+  uint32 uinque_id_;
+  std::string parsed_key_;
+  std::atomic<DriverStatus> dri_status_;
+  // saved tensor data and string message
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> mem_mr_;
+  // uint32 prefix_msg_len_;
+  std::string prefix_msg_;
+  int allocate_size_ = 0;
+  // lkey of the registered region
+  uint32_t lkey_;
+  // address of the registered region
+  uint64_t addr_;
+  // records whether the meta data changed
+  bool meta_changed_ = false;
+
+  // allocated for sending the prefix string
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> send_mem_mr_;
+
+  // send-side tensor buffer reference
+  TensorBuffer* src_buffer_ = nullptr;
+  // send-side tensor memory region
+  struct ibv_mr* smr_ = nullptr;  // not owned
+  // address used when the tensor can be memcpy-ed directly
+  void* tensor_addr_ = nullptr;
+
+  int local_allocate_size_ = 0;
+
+  // allocated for sending the tensor
+  std::shared_ptr<RemoteBytesAddrMemoryRegion> send_region_;
+
+  uint64 send_micros_ = 0;
+};
+
+class RdmaSendDriverMgr {
+  friend class RdmaChannel;
+  friend class ChannelRecordTensorMetaData;
+  friend class RdmaAdapter;
+
+ public:
+  explicit RdmaSendDriverMgr(RdmaChannel* channel);
+
+  size_t InitLocalDriverEntry();
+
+  void NotifyRemoteDriverEntry();
+
+  ~RdmaSendDriverMgr() {
+  }
+
+  // send service: updates recv_entries_
+  void RpcUpdateRemoteDriverEntry(const DriverMessageReq* request,
+                                  DriverMessageResp* response);
+
+  // recv client: updates driver_entries_
+  void RpcUpdateDriverEntries(const DriverMessageResp& resp);
+
+  bool RpcReqResp(GrpcVerbsClient* client, const DriverMessageReq& req);
+
+  void AllocateRecvEntriesStringMemoryAndRegion();
+
+  std::shared_ptr<DriverEntry> GetRecvEntry(const std::string& parsed_key,
+                                            bool* has_data);
+
+  std::shared_ptr<DriverEntry> GetDriverEntry(const std::string& parsed_key,
+                                              bool* has_data);
+
+ public:
+  std::atomic<bool> driver_mgr_is_ok_;
+  typedef std::unordered_map<std::string, std::shared_ptr<DriverEntry>>
+      EntryMapType;
+  // typedef absl::flat_hash_map<std::string, std::shared_ptr<DriverEntry>>
+  //     EntryMapType;
+
+ protected:
+  RdmaChannel* channel_;
+  EntryMapType driver_entries_;
+  EntryMapType recv_entries_;
+};
+
+class ChannelRecordTensorMetaData {
+ public:
+  // typedef absl::flat_hash_map<std::string, TensorMetaData> RecordMapType;
+  typedef std::unordered_map<std::string, TensorMetaData> RecordMapType;
+  typedef std::unordered_map<uint32, std::string> RecordMapUniIdType;
+
+  explicit ChannelRecordTensorMetaData(RdmaChannel* channel);
+
+  static uint32 GetEnumSize(const DataType& data_type);
+
+  static int GetTensorBytes(const TensorMetaData& m);
+
+  void AllocateMemoryAndRegion(const string& key,
+                               const TensorMetaData& m,
+                               ibv_pd* pd,
+                               void** addr,
+                               ibv_mr** mr,
+                               int* addr_size,
+                               Allocator* alloc_attr = nullptr) const;
+
+  void AllocateSendStringMemoryAndRegion(ibv_pd* pd,
+                                         void** addr,
+                                         ibv_mr** mr,
+                                         int* addr_size,
+                                         Allocator* alloc_attr = nullptr);
+
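// --- Illustrative aside (not part of the patch): DriverEntry::dri_status_
// above walks through the DriverStatus values as a small handshake. The
// transition order here is inferred from the enum names, not confirmed by
// the patch, and Advance() is a hypothetical helper. Standalone:
#include <atomic>
#include <cstdio>

enum HandshakeState { INIT, RPC0, RPC1, NOT_READY, READY, FAILED };

// Advance only along the expected edge; any other transition fails.
static bool Advance(std::atomic<HandshakeState>& s, HandshakeState from,
                    HandshakeState to) {
  HandshakeState expected = from;
  return s.compare_exchange_strong(expected, to);
}

int main() {
  std::atomic<HandshakeState> s{INIT};
  bool ok = Advance(s, INIT, RPC0) && Advance(s, RPC0, RPC1) &&
            Advance(s, RPC1, NOT_READY) && Advance(s, NOT_READY, READY);
  std::printf("handshake %s\n", ok ? "complete" : "failed");
  return ok ? 0 : 1;
}
// --- end aside ---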
+  void Record(const std::string& tensor_name,
+              const TensorMetaData& m);
+
+  static StringPiece ConsumeNextPart(StringPiece* s, char delim);
+
+  static string RegexEdgeName(const string& str);
+
+  void InitMetaDataFromEnv();
+
+  const RecordMapType& GetChannelTensorsMetaData() {
+    return channel_tensors_meta_data_;
+  }
+
+  const RecordMapUniIdType& GetChannelTensorsUidParsedkey() {
+    return channel_tensors_uid_parsed_key_;
+  }
+
+ public:
+  RecordMapType channel_tensors_meta_data_;
+
+  RecordMapUniIdType channel_tensors_uid_parsed_key_;
+
+ private:
+  RdmaChannel* channel_;
+  mutex channel_tensor_meta_data_mu_;
+  // uid_ must stay below RDMA_IMM_MAX_REQUEST_ID
+  uint32 uid_ = RDMA_IMM_MIN_SENDMGR_BASE;
+};
+
+class RdmaMgr;
 class RdmaChannel {
   friend class RdmaAdapter;
   friend class RdmaMessageBuffer;
@@ -411,10 +974,14 @@ class RdmaChannel {
   friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
+  friend class RdmaSendDriverMgr;
+  friend class ChannelRecordTensorMetaData;
 
  public:
   explicit RdmaChannel(const RdmaAdapter* adapter, const string local_name,
-                       const string remote_name_);
+                       const string remote_name_,
+                       GrpcChannelCache* channel_cache, ibv_cq* cq);
+
   ~RdmaChannel();
   inline const RdmaAddress& self() { return self_; }
   RdmaAddress address() const;
@@ -439,8 +1006,84 @@ class RdmaChannel {
   RdmaTensorResponse* UpdateTensorResponse(const RdmaMessage& rm);
   void RemoveTensorResponse(uint32_t request_index);
 
-  static const int kNumMessageBuffers = 2;
+  // static const int kNumMessageBuffers = 2;
+  static const int kNumMessageBuffers = Const_kNumMessageBuffers;
   static const int kPingRecvWrid = 0;
+
+  // For CAT logging
+  RdmaTensorRequest* GetTensorRequestForCat(uint32_t request_index);
+
+  inline size_t Alloc(size_t size, void** p, ibv_mr** mr,
+                      bool dynamic = false, size_t realloc_size = 0) const;
+
+  bool FindLocalMr(const std::string& key, void** remote_bytes_addr,
+                   ibv_mr** mr, int* length);
+
+  inline void FindOrCreateRemoteBytesAddrMemoryRegion(const std::string& key,
+      void** remote_bytes_addr /*new*/,
+      ibv_mr** mr /*new*/,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>* region,
+      size_t length,
+      const Allocator* alloc_attr = nullptr);
+
+  size_t ChannelAllocateTensors(const string& key, const TensorMetaData& meta,
+      const Allocator* alloc_attr, ibv_mr** mr /*new*/,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>* region,
+      void** rdma_addr /*new*/);
+
+  GrpcChannelCache* GetChannelChache() { return channel_cache_; }
+
+  std::shared_ptr<RdmaSendDriverMgr> GetRdmaSendDriverMgr() {
+    return rdma_send_driver_mgr_;
+  }
+
+  // For the tensor response path / the Send kernel op
+  void SendDriverData(const Tensor& in,
+                      bool is_dead,
+                      const std::string& name);
+
+  // (1) entry point of the send-driven setup
+  void InitAndSetDriverStatus();
+
+  void TestPleSendOrCheck() {
+    LOG(INFO) << "TestPleSendOrCheck begin...";
+    PleSendOrCheck();
+  }
+
+  void FakeAllocateTest() {
+    Tensor fill_shape_tensor(DT_INT32, TensorShape({1}));
+    fill_shape_tensor.vec<int32>()(0) = 1;
+    // fill_shape_tensor.vec<int32>()(1) = 256;
+    // fill_shape_tensor.vec<int32>()(2) = 1024;
+    // fill_shape_tensor.vec<int32>()(3) = 1024;
+    auto flat = fill_shape_tensor.flat<int32>();
+    auto ts = fill_shape_tensor.vec<int32>();
+    LOG(INFO) << "ts size:" << ts.size()
+              << " flat size:" << flat.size();
+    for (int i = 0; i < flat.size(); ++i) {
+      // flat(i) = i;
+      LOG(INFO) << "ts " << i << " :" << ts(i);
+    }
+  }
+
+  void PleSendOrCheck();
+
+  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
+
+  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
+                                          DataType dtype,
+                                          const TensorShape& shape,
+                                          bool is_dead, size_t proto_size);
+  // Memory regions
+  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+
+ public:
+  bool could_send_driver_ = false;
+  string local_name_;
+  string remote_name_;
+  std::shared_ptr<ChannelRecordTensorMetaData> channel_record_;
+  std::shared_ptr<RdmaSendDriverMgr> rdma_send_driver_mgr_;
+  std::shared_ptr<LocalDriverBufferMgr> local_driver_buffer_mgr_;
 
  private:
   static const int kPingBuffSize = 1024;
@@ -452,24 +1095,49 @@ class RdmaChannel {
 
  protected:
   const RdmaAdapter* adapter_;
+  RdmaMgr* rdma_mgr_;
   RdmaAddress self_;
-  string local_name_;
-  string remote_name_;
   ibv_qp* qp_;
   mutex mu_;
-  bool connected_ TF_GUARDED_BY(mu_) = false;
-  RdmaAddress remote_ TF_GUARDED_BY(mu_);
-  bool remote_set_ TF_GUARDED_BY(mu_) = false;
+  bool connected_ GUARDED_BY(mu_) = false;
+  RdmaAddress remote_ GUARDED_BY(mu_);
+  bool remote_set_ GUARDED_BY(mu_) = false;
   mutex ct_mu_;
   typedef std::unordered_map<uint32_t, RdmaTensorRequest> RequestTable;
-  RequestTable request_table_ TF_GUARDED_BY(ct_mu_);
-  uint32_t request_serial_ TF_GUARDED_BY(ct_mu_);
+  RequestTable request_table_ GUARDED_BY(ct_mu_);
+  typedef std::unordered_map<std::string, uint32_t> ParsedKeyToIndex;
+  typedef std::unordered_map<uint32_t, std::string> IndexToParsedKey;
+
+  IndexToParsedKey req_table_idx_to_pkey_ GUARDED_BY(ct_mu_);
+
+  uint32_t request_serial_ GUARDED_BY(ct_mu_);
   mutex responses_mu_;
-  typedef std::unordered_map<uint32_t, RdmaTensorResponse> ResponsesTable;
-  ResponsesTable responses_table_ TF_GUARDED_BY(responses_mu_);
-  RdmaMessageBuffer* tx_message_buffer_;
-  RdmaMessageBuffer* rx_message_buffer_;
+  typedef std::unordered_map<uint32_t, std::shared_ptr<RdmaTensorResponse>>
+      ResponsesTable;
+  ResponsesTable responses_table_ GUARDED_BY(responses_mu_);
   std::vector<RdmaMessageBuffer*> message_buffers_;
+
+  // Managed memory regions, keyed by tensor key
+  mutex remote_bytes_addr_mu_;
+  typedef absl::flat_hash_map<std::string,
+      std::shared_ptr<RemoteBytesAddrMemoryRegion>> MRegionType;
+  // typedef std::unordered_map<std::string,
+  //     std::shared_ptr<RemoteBytesAddrMemoryRegion>> MRegionType;
+  MRegionType remote_bytes_addr_mrs_ GUARDED_BY(remote_bytes_addr_mu_);
+
+  GrpcChannelCache* const channel_cache_;
+
+  // meta-data record
+  mutex tensor_meta_data_mu_;
+  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+
+  // memory allocator state
+  Allocator* rdma_mem_allocator_;
+  RdmaMemoryMgr* rdma_memory_mgr_;
+  std::vector<SubAllocator::Visitor> alloc_visitors_;
+  std::vector<SubAllocator::Visitor> free_visitors_;
+  struct ibv_pd* pd_;  // not owned
+  size_t pagesize_ = sysconf(_SC_PAGESIZE);
 };
 
 // Class that represents a buffer for Rdma message sending.
@@ -498,15 +1166,31 @@ class RdmaMessageBuffer {
   void EnqueueItem(string Item);
   void SendNextItem();
   void CreateCPUBuffer(size_t size, bool lock = true);
+  void ChunkCreateCPUBuffer(size_t size, void* buffer, ibv_mr* mr,
+                            bool lock = true);
   void SetRemoteMR(RemoteMR rmi, bool override);
   void Write(uint32_t imm_data, size_t buffer_size);
+
   static void Write(const RdmaChannel* channel, uint32_t imm_data,
                     size_t buffer_size, uint64_t src_addr, uint32_t lkey,
                     uint64_t remote_addr, uint32_t rkey,
                     RdmaWriteIDType write_type, void* write_context);
-  static void SendAck(const RdmaChannel* channel);
+
+  static void WriteWithPrefix(const RdmaChannel* channel, uint32_t imm_data,
+                              size_t buffer_size, uint64_t src_addr,
+                              uint32_t lkey, uint64_t remote_addr,
+                              uint32_t rkey, RdmaWriteIDType write_type,
+                              void* write_context, uint64_t prefix_addr,
+                              uint32_t prefix_lkey, size_t prefix_size);
+
+  static void SendAck(const RdmaChannel* channel, int pair_index);
+
+ public:
+  int pair_index_;
+  uint64_t rm_ack_micros_;
 
  protected:
+  int64 time_guard_;
   const RdmaChannel* channel_;
   void* buffer_ = nullptr;
   bool buffer_on_host_ = true;
@@ -515,11 +1199,61 @@
   ibv_mr* self_ = nullptr;
   mutex mu_;
   RemoteMR remote_;
-  std::queue<string> queue_ TF_GUARDED_BY(mu_);
-  BufferStatus local_status_ TF_GUARDED_BY(mu_) = none;
-  BufferStatus remote_status_ TF_GUARDED_BY(mu_) = none;
+  std::queue<string> queue_ GUARDED_BY(mu_);
+  BufferStatus local_status_ GUARDED_BY(mu_) = none;
+  BufferStatus remote_status_ GUARDED_BY(mu_) = none;
+};
+
+class VerbsEnvRegistrar {
+ public:
+  static VerbsEnvRegistrar* Instance() {
+    static VerbsEnvRegistrar* instance_ = new VerbsEnvRegistrar();
+    return instance_;
+  }
+
+  int RdmaCQpoolSize() {
+    return rdma_cqpool_size_;
+  }
+
+  bool RdmaEnableSendDriven() {
+    return enable_send_driven_;
+  }
+
+  int RdmaTensorBufferRatio() {
+    return rdma_tensor_buffer_ratio_;
+  }
+
+  int RdmaCqNums() {
+    return rdma_cq_nums_;
+  }
+
+  int RdmaChunkSize() {
+    return rdma_chunk_size_;
+  }
+
+ private:
+  VerbsEnvRegistrar() {
+    rdma_cqpool_size_ = RDMACQPOOLSIZE();
+    CHECK(rdma_cqpool_size_ < 500 && rdma_cqpool_size_ >= 1)
+        << "rdma_cqpool_size_ must be at least 1 and less than 500";
+    enable_send_driven_ = RDMAENABLESENDDRIERN() == 1 ? true : false;
+
+    rdma_tensor_buffer_ratio_ = RDMATENSORBUFFERRATIO();
+    CHECK(rdma_tensor_buffer_ratio_ < 100 && rdma_tensor_buffer_ratio_ >= 1)
+        << "rdma_tensor_buffer_ratio_ must be at least 1 and less than 100";
+
+    rdma_cq_nums_ = RDMACQNUMS();
+    CHECK(rdma_cq_nums_ < 100 && rdma_cq_nums_ >= 1)
+        << "rdma_cq_nums_ must be at least 1 and less than 100";
+    rdma_chunk_size_ = RDMACHUNKSIZE();
+  }
+
+  int rdma_cqpool_size_;
+  bool enable_send_driven_;
+  int rdma_tensor_buffer_ratio_;
+  int rdma_cq_nums_;
+  int rdma_chunk_size_;
 };
 
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_H_
diff --git a/tensorflow_networking/verbs/rdma_mgr.cc b/tensorflow_networking/verbs/rdma_mgr.cc
index 6d19758..300f16e 100644
--- a/tensorflow_networking/verbs/rdma_mgr.cc
+++ b/tensorflow_networking/verbs/rdma_mgr.cc
@@ -13,20 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
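// --- Illustrative aside (not part of the patch): RDMACQPOOLSIZE(),
// RDMACQNUMS() and friends are not defined in this diff. Assuming they read
// environment knobs, this standalone sketch shows the usual pattern; the
// variable name RDMA_CQ_NUMS and the defaults are invented for the example.
#include <cstdlib>

// Reads an integer knob from the environment, falling back to `def` and
// rejecting values outside [lo, hi), mirroring the CHECKs in
// VerbsEnvRegistrar above.
static int EnvInt(const char* name, int def, int lo, int hi) {
  const char* v = std::getenv(name);
  if (v == nullptr || *v == '\0') return def;
  int parsed = std::atoi(v);
  return (parsed < lo || parsed >= hi) ? def : parsed;
}

int main() {
  int rdma_cq_nums = EnvInt("RDMA_CQ_NUMS", 1, 1, 100);
  return rdma_cq_nums >= 1 ? 0 : 1;
}
// --- end aside ---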
==============================================================================*/ +#ifdef TENSORFLOW_USE_VERBS + #include "tensorflow_networking/verbs/rdma_mgr.h" #include #include -#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" -#include "tensorflow/core/common_runtime/gpu/gpu_util.h" -#include "tensorflow/core/common_runtime/pool_allocator.h" -#include "tensorflow/core/common_runtime/process_state.h" +#include "tensorflow_networking/verbs/grpc_verbs_client.h" +#include "tensorflow_networking/verbs/verbs_service.pb.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" #include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow_networking/verbs/grpc_verbs_client.h" -#include "tensorflow_networking/verbs/verbs_service.pb.h" namespace tensorflow { @@ -41,36 +39,70 @@ RdmaMgr::RdmaMgr(const WorkerEnv* const worker_env, std::vector workers; worker_env_->session_mgr->LegacySession()->worker_cache->ListWorkers( &workers); + num_remote_workers_ = workers.size() - 1; VLOG(2) << "rmda_mgr on local worker: " << local_worker_; + string other_worker_name = ""; + int worker_cq_num = 0; + int ps_cq_num = 0; for (size_t i = 0; i < workers.size(); i++) { if (local_worker_.compare(workers[i]) != 0) { + other_worker_name += ";" + workers[i]; + ibv_cq* cq = nullptr; + if (workers[i].find("worker") != string::npos) { + RDMA_LOG(2) << "Schedule CQ num For worker: " + << workers[i] + << " cq_num:" + << worker_cq_num % rdma_adapter_->cq_nums_; + cq = rdma_adapter_->cq_vec_[worker_cq_num % rdma_adapter_->cq_nums_]; + worker_cq_num++; + } else if (workers[i].find("ps") != string::npos) { + RDMA_LOG(2) << "Schedule CQ num For ps: " + << workers[i] + << " cq_num:" + << ps_cq_num % rdma_adapter_->cq_nums_; + cq = rdma_adapter_->cq_vec_[ps_cq_num % rdma_adapter_->cq_nums_]; + ps_cq_num++; + } else { + RDMA_LOG(2) << "Schedule CQ num For chief: " + << workers[i] + << " cq_num:" + << 0; + cq = rdma_adapter_->cq_vec_[0]; + } channel_table_.insert( {workers[i], - new RdmaChannel(rdma_adapter_, local_worker_, workers[i])}); + new RdmaChannel(rdma_adapter_, local_worker_, workers[i], channel_cache_, + cq)}); } } + LOG(INFO) << "local_worker: " << local_worker_ << " other_channel:" << other_worker_name; } // Setup Rdma channels between peers. // This is done at the beginning of the server setup. 
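// --- Illustrative aside (not part of the patch): SetupChannels below
// generalizes the old idx[] = {1, 0} crossing to kNumMessageBuffers entries,
// pairing each even local tx buffer with the odd remote rx buffer and vice
// versa. A standalone check of that index math:
#include <cassert>

int main() {
  const int kNumMessageBuffers = 6;  // illustrative; the patch uses 80
  int idx[kNumMessageBuffers];
  for (int k = 0; k < kNumMessageBuffers; k += 2) {
    idx[k] = k + 1;  // local tx (even) lines up with remote rx (odd)
    idx[k + 1] = k;  // local rx (odd) lines up with remote tx (even)
  }
  // Yields {1, 0, 3, 2, 5, 4}: every adjacent pair is crossed, as the
  // "connections are crossed" comment in SetupChannels says.
  assert(idx[0] == 1 && idx[1] == 0 && idx[4] == 5 && idx[5] == 4);
  return 0;
}
// --- end aside ---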
void RdmaMgr::SetupChannels() { + LOG(INFO) << "channel_table_size:" << channel_table_.size(); for (const auto& p : channel_table_) { string worker_name = p.first; RDMA_LOG(2) << "Connecting to remote node " << worker_name; + LOG(INFO) << "Connecting to remote node " << worker_name; RdmaChannel* rc = p.second; GetRemoteAddressRequest req; GetRemoteAddressResponse resp; // get the channel cache SharedGrpcChannelPtr client_channel = channel_cache_->FindWorkerChannel(worker_name); + + CHECK(client_channel != nullptr) << "target:" << worker_name << " client_channel is null!"; + GrpcVerbsClient* client = new GrpcVerbsClient(client_channel); CHECK(client != nullptr) << "No worker known as " << worker_name; // setting up request req.set_host_name(local_worker_); - Channel* channel_info = req.mutable_channel(); + ChannelInfo* channel_info = req.mutable_channel(); channel_info->set_lid(rc->self_.lid); channel_info->set_qpn(rc->self_.qpn); channel_info->set_psn(rc->self_.psn); @@ -101,12 +133,21 @@ void RdmaMgr::SetupChannels() { rc->SetRemoteAddress(ra, false); rc->Connect(); int i = 0; - int idx[] = {1, 0}; + // int idx[] = {1, 0}; + // {1, 0, 3, 2, 5, 4} + int idx[RdmaChannel::kNumMessageBuffers + 1]; + for (auto k = 0; k < RdmaChannel::kNumMessageBuffers; k = k + 2) { + // for (auto k=0; k<2; k=k+2) { + idx[k] = k+1; + idx[k+1] = k; + } + for (const auto& mr : resp.mr()) { // the connections are crossed, i.e. // local tx_message_buffer <---> remote rx_message_buffer_ // local rx_message_buffer <---> remote tx_message_buffer_ // hence idx[] = {1, 0}. + // LOG(ERROR) << "resp index:" << i << " local message_buffer idx:" << idx[i]; RdmaMessageBuffer* rb = rc->message_buffers_[idx[i]]; RemoteMR rmr; rmr.remote_addr = mr.remote_addr(); @@ -134,10 +175,11 @@ void RdmaMgr::SetupChannels() { bool RdmaMgr::ConnectivityCheck() { int i, rcnt = 0, scnt = 0; + int num_remote_workers = 0; for (const auto& p : channel_table_) { + num_remote_workers++; string worker_name = p.first; RdmaChannel* rc = p.second; - VLOG(2) << "Ping to " << worker_name; CHECK(rc->PingPostSend() == 0) << "Couldn't post send to " << worker_name << " with error: " << std::strerror(errno); @@ -145,38 +187,50 @@ bool RdmaMgr::ConnectivityCheck() { rc->Recv(); } } + LOG(INFO) << "PingPostSend num_remote_workers:" << num_remote_workers; - while (rcnt < num_remote_workers_ || scnt < num_remote_workers_) { - int ne; - do { - ne = ibv_poll_cq(rdma_adapter_->cq_, 2 * num_remote_workers_, - rdma_adapter_->wc_); - CHECK(ne >= 0) << "poll CQ failed " << ne << "with error" - << std::strerror(errno); - } while (ne < 1); - - for (i = 0; i < ne; ++i) { - ibv_wc_status s = rdma_adapter_->wc_[i].status; - // recv complete - if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) { - CHECK(s == IBV_WC_SUCCESS) - << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "(" - << rdma_adapter_->wc_[i].status << ") for PING_RECV_WRID"; - ++rcnt; - // send complete - } else { - RdmaChannel* rc = - reinterpret_cast(rdma_adapter_->wc_[i].wr_id); - CHECK(s == IBV_WC_SUCCESS) - << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "(" - << rdma_adapter_->wc_[i].status << ") to " << rc->remote_name_; - ++scnt; - } - } // for + while (rcnt < num_remote_workers || scnt < num_remote_workers) { + for (int j = 0; j < rdma_adapter_->cq_nums_; j++) { + int ne = 0; + int retry_times = 0; + do { + ne = ibv_poll_cq(rdma_adapter_->cq_vec_[j], 2 * num_remote_workers_, + rdma_adapter_->wc_vec_[j]); + CHECK(ne >= 0) << "poll CQ failed " << ne << "with 
error" + << std::strerror(errno); + retry_times ++; + if (retry_times > 10) { + break; + } + } while (ne < 1); + for (i = 0; i < ne; ++i) { + ibv_wc_status s = rdma_adapter_->wc_vec_[j][i].status; + // recv complete + if ((int)rdma_adapter_->wc_vec_[j][i].wr_id == RdmaChannel::kPingRecvWrid) { + CHECK(s == IBV_WC_SUCCESS) + << ": " << ibv_wc_status_str(rdma_adapter_->wc_vec_[j][i].status) << "(" + << rdma_adapter_->wc_vec_[j][i].status << ") for PING_RECV_WRID"; + ++rcnt; + // send complete + } else { + RdmaChannel* rc = + reinterpret_cast(rdma_adapter_->wc_vec_[j][i].wr_id); + CHECK(s == IBV_WC_SUCCESS) + << ": " << ibv_wc_status_str(rdma_adapter_->wc_vec_[j][i].status) << "(" + << rdma_adapter_->wc_vec_[j][i].status << ") to " << rc->remote_name_; + ++scnt; + } + } // for + } } // while + LOG(INFO) << "ConnectivityCheck:" + << num_remote_workers + << " rcnt:" << rcnt + << " scnt:" << scnt; + CHECK(rcnt == scnt) << "Connectivity check failed!"; rdma_adapter_->StartPolling(); - return (num_remote_workers_ == rcnt) && (num_remote_workers_ == scnt); + return rcnt == scnt; } RdmaMgr::~RdmaMgr() { @@ -192,10 +246,48 @@ RdmaMgr::~RdmaMgr() { // channel object that is connected to the named peer. RdmaChannel* RdmaMgr::FindChannel(const string& name) { ChannelTable::iterator iter = channel_table_.find(name); - CHECK(iter != channel_table_.end()); + CHECK(iter != channel_table_.end()) + << "name:" << name + << "table_name like:" + << channel_table_.begin()->first; return iter->second; } +bool RdmaMgr::NotifyAsyncAllocatorTest() { + for (const auto& p : channel_table_) { + string worker_name = p.first; + LOG(INFO) << "NotifyAsyncAllocator to remote node " << worker_name; + RdmaChannel* rc = p.second; + // 请 ps 端进行态空间分配,并同步静态空间给我 + rc->PleSendOrCheck(); + LOG(INFO) << "NotifyAsyncAllocator PleSendOrCheck to remote node" + << worker_name + << " Succeed!"; + } + return true; +} + +bool RdmaMgr::NotifyAsyncAllocator() { + for (const auto& p : channel_table_) { + string worker_name = p.first; + LOG(INFO) << "NotifyAsyncAllocator to remote node " << worker_name; + RdmaChannel* rc = p.second; + // 分配静态空间并且同步自身的静态空间给对方 + // TODO(wuyongyu02): change to large MR + rc->InitAndSetDriverStatus(); + LOG(INFO) << "NotifyAsyncAllocator InitAndSetDriverStatus to remote node " + << worker_name + << " Succeed!"; + // 请 ps 端进行态空间分配,并同步静态空间给我 + rc->PleSendOrCheck(); + LOG(INFO) << "NotifyAsyncAllocator PleSendOrCheck to remote node" + << worker_name + << " Succeed!"; + + } + return true; +} + bool IsGDRAvailable() { #if defined(__APPLE__) return false; @@ -236,8 +328,9 @@ int TryToReadNumaNode(ibv_device* device) { if (strings::safe_strto32(content, &value)) { if (value < 0) { LOG(INFO) << "Successful NUMA node read from SysFS had negative value (" - << value << "), but there must be at least one NUMA node" - ", so returning NUMA node zero"; + << value + << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; return 0; } LOG(INFO) << "NUMA node for device: " << device->name << " is " << value; @@ -254,26 +347,35 @@ void MRDeleter(ibv_mr* mr) { } void RdmaMgr::InitAllocators() { - static std::once_flag flag; - std::call_once( - flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; }); + // static std::once_flag flag; + // std::call_once( + // flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; }); } /*static*/ void RdmaMgr::RegMemVisitors() { + // SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, + // size_t num_bytes) { + // LOG(INFO) << 
"RdmaMgr alloc_visitor"; + // RdmaMemoryMgr::Singleton().InsertMemoryRegion( + // ptr, num_bytes, strings::StrCat("CPU:", numa_node)); + // }; + // SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, + // size_t num_bytes) { + // RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes); + // }; + + // LOG(INFO) << " ProcessState::singleton()->AddCPUAllocVisitor..."; + // ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); + // ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); + +#if GOOGLE_CUDA SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("CPU:", numa_node)); + LOG(ERROR) << "Rdma For GPU is not supported!"; }; SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, size_t num_bytes) { - RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes); }; - - ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); - ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); - -#if GOOGLE_CUDA GPUProcessState::singleton()->AddGpuHostAllocVisitor(0, alloc_visitor); GPUProcessState::singleton()->AddGpuHostFreeVisitor(0, free_visitor); @@ -289,8 +391,8 @@ void RdmaMgr::InitAllocators() { SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id, size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); + // RdmaMemoryMgr::Singleton().InsertMemoryRegion( + // ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); }; GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor); @@ -300,3 +402,5 @@ void RdmaMgr::InitAllocators() { } } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma_mgr.h b/tensorflow_networking/verbs/rdma_mgr.h index 06df124..9e3d9bd 100644 --- a/tensorflow_networking/verbs/rdma_mgr.h +++ b/tensorflow_networking/verbs/rdma_mgr.h @@ -16,18 +16,32 @@ limitations under the License. 
 #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 #define TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include <string>
 #include <unordered_map>
 
+#include "tensorflow_networking/verbs/rdma.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
-#include "tensorflow_networking/verbs/rdma.h"
+// For the timeline logger
+#include "tensorflow/core/distributed_runtime/worker_cache_logger.h"
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_partial.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 
 namespace tensorflow {
 
+class RdmaChannel;
+class RdmaAdapter;
+class RdmaTensorRequest;
 class RdmaMgr {
   friend class RdmaChannel;
   friend class RdmaAdapter;
+  friend class RdmaSendDriverMgr;
 
  public:
   explicit RdmaMgr(const WorkerEnv* const worker_env,
@@ -40,17 +54,74 @@ class RdmaMgr {
   static void RegMemVisitors();
   const string& local_worker() { return local_worker_; }
 
- private:
+  bool NotifyAsyncAllocator();
+
+  bool NotifyAsyncAllocatorTest();
+
+ public:
   string local_worker_;
-  size_t num_remote_workers_;
   const WorkerEnv* const worker_env_;
   GrpcChannelCache* const channel_cache_;
+
+ private:
+  size_t num_remote_workers_;
   RdmaAdapter* rdma_adapter_;
   typedef std::unordered_map<string, RdmaChannel*> ChannelTable;
   ChannelTable channel_table_;
   TF_DISALLOW_COPY_AND_ASSIGN(RdmaMgr);
 };
 
+class RdmaBasicCPUAllocator : public SubAllocator {
+ public:
+  RdmaBasicCPUAllocator(const std::vector<SubAllocator::Visitor>& alloc_visitors,
+                        const std::vector<SubAllocator::Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors) {
+    numa_node_ = port::kNUMANoAffinity;
+  }
+
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    void* ptr = nullptr;
+    if (num_bytes > 0) {
+      if (numa_node_ == port::kNUMANoAffinity) {
+        ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+      } else {
+        ptr = port::NUMAMalloc(numa_node_, num_bytes,
+                               static_cast<int>(alignment));
+      }
+      VisitAlloc(ptr, numa_node_, num_bytes);
+    }
+    return ptr;
+  }
+
+  void Free(void* ptr, size_t num_bytes) override {
+    if (num_bytes > 0) {
+      VisitFree(ptr, numa_node_, num_bytes);
+      if (numa_node_ == port::kNUMANoAffinity) {
+        port::AlignedFree(ptr);
+      } else {
+        port::NUMAFree(ptr, num_bytes);
+      }
+    }
+  }
+
+ private:
+  int numa_node_;
+  TF_DISALLOW_COPY_AND_ASSIGN(RdmaBasicCPUAllocator);
+};
+
+// TODO(wuyongyu02): remove this class and its registration once the default
+// cpu_allocator() returns a visitable allocator.
+class BFCRdmaAllocator : public BFCAllocator {
+ public:
+  BFCRdmaAllocator(const std::vector<SubAllocator::Visitor>& alloc_visitors,
+                   const std::vector<SubAllocator::Visitor>& free_visitors)
+      : BFCAllocator(new RdmaBasicCPUAllocator(alloc_visitors, free_visitors),
+                     1LL << 36, true, "cpu_rdma_bfc") {
+  }
+};
+// REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+
 }  // namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
diff --git a/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc b/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
index f9a1afa..d54d754 100644
--- a/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow_networking/verbs/rdma_rendezvous_mgr.cc
@@ -13,31 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License.
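// --- Illustrative aside (not part of the patch): how the visitor hooks are
// meant to be wired, mirroring the commented-out code in RegMemVisitors
// above. MakeRdmaAllocator and `mgr` are hypothetical; the classes come from
// this patch.
#include <vector>
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow_networking/verbs/rdma_mgr.h"

namespace tensorflow {
inline Allocator* MakeRdmaAllocator(RdmaMemoryMgr* mgr) {
  // Every chunk the BFC arena maps gets registered as an RDMA memory region,
  // and evicted again on free, so tensors in this arena are directly
  // RDMA-able.
  std::vector<SubAllocator::Visitor> alloc_visitors = {
      [mgr](void* ptr, int numa_node, size_t num_bytes) {
        mgr->InsertMemoryRegion(ptr, num_bytes,
                                strings::StrCat("CPU:", numa_node));
      }};
  std::vector<SubAllocator::Visitor> free_visitors = {
      [mgr](void* ptr, int numa_node, size_t num_bytes) {
        mgr->EvictMemoryRegion(ptr, num_bytes);
      }};
  return new BFCRdmaAllocator(alloc_visitors, free_visitors);
}
}  // namespace tensorflow
// --- end aside ---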
==============================================================================*/ +#ifdef TENSORFLOW_USE_VERBS + #include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h" #include +#include "tensorflow_networking/verbs/verbs_util.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow_networking/verbs/verbs_util.h" +#include "tensorflow/core/distributed_runtime/worker_cache_partial.h" namespace tensorflow { class RdmaRemoteRendezvous : public BaseRemoteRendezvous { public: RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr) - : BaseRemoteRendezvous(env, step_id), rdma_mgr_(rdma_mgr) {} + : BaseRemoteRendezvous(env, step_id) { + rdma_mgr_ = rdma_mgr; + } protected: void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, DoneCallback done) override; + public: + RdmaMgr* rdma_mgr_; + private: ~RdmaRemoteRendezvous() override {} - RdmaMgr* rdma_mgr_; + TF_DISALLOW_COPY_AND_ASSIGN(RdmaRemoteRendezvous); }; @@ -59,8 +67,8 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync( done(s, Args(), recv_args, Tensor{}, false); return; } - CHECK(dst_name.compare(rdma_mgr_->local_worker()) == 0); - RdmaChannel* rc = rdma_mgr_->FindChannel(src_name); + CHECK(dst_name.compare(static_cast(rdma_mgr_)->local_worker()) == 0); + RdmaChannel* rc = static_cast(rdma_mgr_)->FindChannel(src_name); string key(parsed.FullKey()); string key_with_step_id = VerbsUtil::AppendStepidToKey(key, step_id_); @@ -72,6 +80,17 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync( return; } + uint64_t time_now = Env::Default()->NowMicros(); + + // Add to Channel LocalDriverBufferMgr + if (rc->could_send_driver_) { + RDMA_LOG(1) << "Recv From Local key:" << key << " will GetBufferMgr"; + rc->local_driver_buffer_mgr_->QueueLoadAsync( + key, recv_args, std::move(done), Env::Default()->NowMicros()); + return; + } + RDMA_LOG(1) << "Request start:" << key; + RdmaTensorRequest* request = rc->InsertTensorRequest(key, step_id_, dst_dev, recv_args, done); request->Start(); @@ -86,3 +105,5 @@ BaseRemoteRendezvous* RdmaRendezvousMgr::Create(int64 step_id, } } // end namespace tensorflow + +#endif diff --git a/tensorflow_networking/verbs/rdma_rendezvous_mgr.h b/tensorflow_networking/verbs/rdma_rendezvous_mgr.h index a750dbb..5455235 100644 --- a/tensorflow_networking/verbs/rdma_rendezvous_mgr.h +++ b/tensorflow_networking/verbs/rdma_rendezvous_mgr.h @@ -16,9 +16,12 @@ limitations under the License. 
 #ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 #define TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow_networking/verbs/rdma.h"
 #include "tensorflow_networking/verbs/rdma_mgr.h"
 
 namespace tensorflow {
@@ -46,6 +49,12 @@ class RdmaRendezvousMgr : public BaseRendezvousMgr {
   explicit RdmaRendezvousMgr(const WorkerEnv* env);
   void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
 
+  bool NotifyAsyncAllocatorTest() {
+    return rdma_mgr_->NotifyAsyncAllocator();
+  }
+
 protected:
   BaseRemoteRendezvous* Create(int64 step_id,
                                const WorkerEnv* worker_env) override;
@@ -57,4 +66,5 @@
 }  // end namespace tensorflow
 
+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
diff --git a/tensorflow_networking/verbs/verbs_server_lib.cc b/tensorflow_networking/verbs/verbs_server_lib.cc
index 103db21..74ab309 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.cc
+++ b/tensorflow_networking/verbs/verbs_server_lib.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/verbs_server_lib.h"
 
 #include "grpc/support/alloc.h"
 
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 
 namespace tensorflow {
 
@@ -41,7 +43,7 @@ VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
 VerbsServer::~VerbsServer() {
   TF_CHECK_OK(Stop());
   TF_CHECK_OK(Join());
-  delete rdma_mgr_;
+  // delete rdma_mgr_;
   delete verbs_service_;
   delete channel_cache_;
 }
@@ -49,8 +51,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
                      "/task:", server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -59,6 +61,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());
 
   const string host_port = (*channel_cache)->TranslateTask(name_prefix);
+
   int requested_port;
 
   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
@@ -79,7 +82,6 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
   GrpcServerOptions opts;
   opts.service_func = service_func;
   opts.rendezvous_mgr_func = rendezvous_mgr_func;
@@ -88,12 +90,16 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
     CHECK(ChannelCacheFactory(server_def(),
diff --git a/tensorflow_networking/verbs/verbs_server_lib.cc b/tensorflow_networking/verbs/verbs_server_lib.cc
index 103db21..74ab309 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.cc
+++ b/tensorflow_networking/verbs/verbs_server_lib.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
==============================================================================*/

+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/verbs_server_lib.h"

 #include "grpc/support/alloc.h"
+#include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow_networking/verbs/rdma_mgr.h"
-#include "tensorflow_networking/verbs/rdma_rendezvous_mgr.h"

 namespace tensorflow {

@@ -41,7 +43,7 @@ VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
 VerbsServer::~VerbsServer() {
   TF_CHECK_OK(Stop());
   TF_CHECK_OK(Join());
-  delete rdma_mgr_;
+  // delete rdma_mgr_;
   delete verbs_service_;
   delete channel_cache_;
 }

@@ -49,8 +51,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());

   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -59,6 +61,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());

   const string host_port = (*channel_cache)->TranslateTask(name_prefix);
+
   int requested_port;

   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
@@ -79,7 +82,6 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
-
   GrpcServerOptions opts;
   opts.service_func = service_func;
   opts.rendezvous_mgr_func = rendezvous_mgr_func;
@@ -88,12 +90,16 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
     CHECK(ChannelCacheFactory(server_def(), &channel_cache_).ok());
+    LOG(INFO) << "ChannelCacheFactory init GrpcChannelCache End.";
     rdma_mgr_ = new RdmaMgr(worker_env(), channel_cache_);
     // set rdma_mgr for verbs_service and rdma_rendezvous_mgr
     verbs_service_->SetRdmaMgr(rdma_mgr_);
+    LOG(INFO) << "VerbsService SetRdmaMgr End.";
     dynamic_cast<RdmaRendezvousMgr*>(worker_env()->rendezvous_mgr)
         ->SetRdmaMgr(rdma_mgr_);
+    LOG(INFO) << "RdmaRendezvousMgr SetRdmaMgr End.";
   }
+  LOG(INFO) << "VerbsServer::Init End.";
   return s;
 }

@@ -107,10 +113,14 @@ Status VerbsServer::Start() {
       verbs_thread_.reset(worker_env()->env->StartThread(
           ThreadOptions(), "TF_verbs_service",
           [this] { verbs_service_->HandleRPCsLoop(); }));
+      LOG(INFO) << "Start SetupChannels begin:";
       rdma_mgr_->SetupChannels();
+      LOG(INFO) << "rdma_mgr_ SetupChannels succeed!";
       CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
+      LOG(INFO) << "rdma_mgr_ Connectivity check succeed!";
       rdma_mgr_->InitAllocators();
       verbs_state_ = CONNECTED;
+      LOG(INFO) << "verbs state CONNECTED.";
     }
   }
   return s;
@@ -171,3 +181,5 @@ static VerbsServerRegistrar registrar;
 }  // namespace

 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
diff --git a/tensorflow_networking/verbs/verbs_server_lib.h b/tensorflow_networking/verbs/verbs_server_lib.h
index 2869be1..3662921 100644
--- a/tensorflow_networking/verbs/verbs_server_lib.h
+++ b/tensorflow_networking/verbs/verbs_server_lib.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
 #define TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_

-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#ifdef TENSORFLOW_USE_VERBS
+
 #include "tensorflow_networking/verbs/grpc_verbs_service.h"
 #include "tensorflow_networking/verbs/rdma_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"

 namespace tensorflow {

@@ -43,7 +45,7 @@ class VerbsServer : public GrpcServer {
                RendezvousMgrCreationFunction rendezvous_mgr_func);
   Status ChannelCacheFactory(const ServerDef& server_def,
                              GrpcChannelCache** channel_cache);
-
+
  private:
   RdmaMgr* rdma_mgr_;

@@ -51,13 +53,14 @@ class VerbsServer : public GrpcServer {
   mutex mu_;
   enum State { DISCONNECTED, CONNECTED };
-  State verbs_state_ TF_GUARDED_BY(mu_);
+  State verbs_state_ GUARDED_BY(mu_);

   GrpcVerbsService* verbs_service_ = nullptr;
-  std::unique_ptr<Thread> verbs_thread_ TF_GUARDED_BY(mu_);
+  std::unique_ptr<Thread> verbs_thread_ GUARDED_BY(mu_);
   GrpcChannelCache* channel_cache_ = nullptr;
 };

 }  // namespace tensorflow

+#endif  // TENSORFLOW_USE_VERBS
 #endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
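The header swaps `TF_GUARDED_BY` for the older `GUARDED_BY` macro (matching the TensorFlow release this patch targets) but keeps the same two-state connection machine. A self-contained sketch of that machine, with `std::mutex` standing in for TF's annotated `mutex`:

    // Sketch of the DISCONNECTED -> CONNECTED transition in VerbsServer;
    // std::mutex stands in for tensorflow::mutex and GUARDED_BY.
    #include <cassert>
    #include <mutex>

    class VerbsStateModel {
     public:
      void Connect() {
        std::lock_guard<std::mutex> l(mu_);
        assert(state_ == DISCONNECTED);
        // SetupChannels / ConnectivityCheck / InitAllocators happen here.
        state_ = CONNECTED;
      }

     private:
      enum State { DISCONNECTED, CONNECTED };
      std::mutex mu_;
      State state_ = DISCONNECTED;  // guarded by mu_ in the real class
    };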
diff --git a/tensorflow_networking/verbs/verbs_service.proto b/tensorflow_networking/verbs/verbs_service.proto
index abdae1d..3f68020 100644
--- a/tensorflow_networking/verbs/verbs_service.proto
+++ b/tensorflow_networking/verbs/verbs_service.proto
@@ -26,7 +26,7 @@ option java_package = "org.tensorflow.contrib.verbs";
 //
 ////////////////////////////////////////////////////////////////////////////////

-message Channel {
+message ChannelInfo {
   int32 lid = 1;
   int32 qpn = 2;
   int32 psn = 3;
@@ -40,16 +40,54 @@ message MemoryRegion {
 }

 message GetRemoteAddressRequest {
   string host_name = 1;
-  Channel channel = 2;
+  ChannelInfo channel = 2;
   repeated MemoryRegion mr = 3;
 }

 message GetRemoteAddressResponse {
   string host_name = 1;
-  Channel channel = 2;
+  ChannelInfo channel = 2;
   repeated MemoryRegion mr = 3;
 }

+message DriverMessageItem {
+  enum DriverStatus {
+    DRIVER_INIT = 0;
+    RPC_0 = 1;
+    RPC_1 = 2;
+    DATA_NOT_READY = 4;
+    DATA_READY = 5;
+    DRIVER_ERROR = 6;
+  }
+  uint32 unique_id = 1;
+  string parsed_key = 2;
+  uint64 remote_addr = 3;
+  uint32 rkey = 4;
+  DriverStatus status = 5;
+  int32 allocate_bytes = 6;
+  bool meta_changed = 7;
+}
+
+message DriverMessageReq {
+  string host_name = 1;
+  repeated DriverMessageItem item = 2;
+}
+
+message DriverMessageResp {
+  string host_name = 1;
+  repeated DriverMessageItem item = 2;
+}
+
+message PleSendOrCheckReq {
+  string host_name = 1;
+}
+
+message PleSendOrCheckResp {
+  string host_name = 1;
+  bool is_ok = 2;
+}
+
 message ErrorStatusProto {
   int32 error_code = 1;
   string error_message = 2;
@@ -65,4 +103,8 @@ message ErrorStatusProto {
 service VerbsService {
   rpc GetRemoteAddress(GetRemoteAddressRequest)
       returns (GetRemoteAddressResponse);
+  rpc ReqDriverMessage(DriverMessageReq)
+      returns (DriverMessageResp);
+  rpc ReqPleSendOrCheck(PleSendOrCheckReq)
+      returns (PleSendOrCheckResp);
 }
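Renaming `Channel` to `ChannelInfo` avoids a clash with the C++ side's channel types, and the two new RPCs carry the driver handshake: `ReqDriverMessage` exchanges per-tensor buffer state, and `ReqPleSendOrCheck` asks the peer to start sending or report readiness. Assuming the usual gRPC C++ codegen for this proto (this repo may instead hand-roll its stubs, so treat the generated header, stub type, and channel target below as assumptions), a client call would look roughly like:

    // Hypothetical client-side call, assuming standard gRPC C++ codegen
    // for verbs_service.proto; "worker1:2222" is a placeholder target.
    #include "grpcpp/grpcpp.h"
    #include "tensorflow_networking/verbs/verbs_service.grpc.pb.h"  // generated

    int main() {
      auto channel = grpc::CreateChannel("worker1:2222",
                                         grpc::InsecureChannelCredentials());
      auto stub = tensorflow::VerbsService::NewStub(channel);

      tensorflow::DriverMessageReq req;
      req.set_host_name("worker0");
      auto* item = req.add_item();
      item->set_parsed_key("edge_1;0:0");
      item->set_status(tensorflow::DriverMessageItem::DATA_READY);

      tensorflow::DriverMessageResp resp;
      grpc::ClientContext ctx;
      grpc::Status s = stub->ReqDriverMessage(&ctx, req, &resp);
      return s.ok() ? 0 : 1;
    }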
diff --git a/tensorflow_networking/verbs/verbs_testlib.h b/tensorflow_networking/verbs/verbs_testlib.h
new file mode 100644
index 0000000..a95cb9d
--- /dev/null
+++ b/tensorflow_networking/verbs/verbs_testlib.h
@@ -0,0 +1,60 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+class Device;
+
+namespace test {
+
+// Provides a handle to a set of TensorFlow servers (masters and
+// workers) for testing purposes.
+//
+// This class currently runs the servers in separate processes; the
+// lifetime of this object is coterminous with the lifetimes of those
+// processes.
+class TestCluster {
+ public:
+  // Creates a new test cluster based on the given `options` (which
+  // configure the number of devices of each type) and a count of
+  // processes `n`. On success, the test cluster is stored in
+  // *out_cluster, and this function returns OK. Otherwise an error is
+  // returned.
+  static Status MakeTestCluster(const SessionOptions& options, int n,
+                                std::unique_ptr<TestCluster>* out_cluster);
+
+  // As above, but allows overriding the server binary path via `binary_path`.
+  static Status MakeTestCluster(const string& binary_path,
+                                const SessionOptions& options, int n,
+                                std::unique_ptr<TestCluster>* out_cluster);
+  ~TestCluster();
+
+  // Returns a vector of string "<hostname>:<port>" pairs that may be
+  // used as targets to construct a GrpcSession.
+  const std::vector<string>& targets() const { return targets_; }
+
+  // Returns a vector of devices available in this test cluster.
+  const std::vector<DeviceAttributes>& devices() const { return devices_; }
+
+ private:
+  TestCluster() = default;
+
+  std::vector<std::unique_ptr<SubProcess>> subprocesses_;
+  std::vector<string> targets_;
+  std::vector<DeviceAttributes> devices_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TestCluster);
+};
+
+}  // end namespace test
+}  // end namespace tensorflow
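A minimal sketch of how the new helper is meant to be used in a test, mirroring the `grpc_testlib` pattern it is adapted from (the device count and task count below are arbitrary):

    // Usage sketch; TF_ASSERT_OK comes from status_test_util.h.
    #include <memory>

    #include "tensorflow/core/lib/core/status_test_util.h"
    #include "tensorflow/core/public/session_options.h"
    #include "tensorflow_networking/verbs/verbs_testlib.h"

    TEST(VerbsTestCluster, StartsTwoTasks) {
      tensorflow::SessionOptions options;
      (*options.config.mutable_device_count())["CPU"] = 1;
      std::unique_ptr<tensorflow::test::TestCluster> cluster;
      TF_ASSERT_OK(
          tensorflow::test::TestCluster::MakeTestCluster(options, 2, &cluster));
      EXPECT_EQ(2, cluster->targets().size());
    }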
diff --git a/tensorflow_networking/verbs/verbs_testlib_server.cc b/tensorflow_networking/verbs/verbs_testlib_server.cc
new file mode 100644
index 0000000..e11a823
--- /dev/null
+++ b/tensorflow_networking/verbs/verbs_testlib_server.cc
@@ -0,0 +1,102 @@
+#include <unordered_map>
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// This binary starts a TensorFlow verbs server (master and worker) for test
+// purposes.
+namespace tensorflow {
+namespace {
+
+Status FillServerDef(const string& job_spec, const string& job_name,
+                     int num_cpus, int num_gpus, int task_index,
+                     ServerDef* options) {
+  options->set_protocol("grpc+verbs");
+  options->set_job_name(job_name);
+  options->set_task_index(task_index);
+
+  uint32 my_tasks_per_replica = 0;
+  for (const string& job_str : str_util::Split(job_spec, ',')) {
+    JobDef* job_def = options->mutable_cluster()->add_job();
+    // Split each entry in the flag into 2 pieces, separated by "|".
+    const std::vector<string> job_pieces = str_util::Split(job_str, '|');
+    CHECK_EQ(2, job_pieces.size()) << job_str;
+    job_def->set_name(job_pieces[0]);
+    // Does a bit more validation of the tasks_per_replica.
+    const StringPiece spec = job_pieces[1];
+    // job_str is of form <job_name>|<host_ports>.
+    const std::vector<string> host_ports = str_util::Split(spec, ';');
+    uint32 tasks_per_replica = host_ports.size();
+    for (size_t i = 0; i < host_ports.size(); ++i) {
+      (*job_def->mutable_tasks())[i] = host_ports[i];
+    }
+    if (job_def->name() == options->job_name()) {
+      my_tasks_per_replica = tasks_per_replica;
+    }
+    LOG(INFO) << "Peer " << job_def->name() << " " << tasks_per_replica << " {"
+              << absl::StrJoin(host_ports, ", ") << "}";
+  }
+  if (my_tasks_per_replica == 0) {
+    return errors::InvalidArgument("Invalid job specification");
+  }
+  ConfigProto* config = options->mutable_default_session_config();
+  (*config->mutable_device_count())["CPU"] = num_cpus;
+  (*config->mutable_device_count())["GPU"] = num_gpus;
+  return Status::OK();
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  tensorflow::string job_spec;
+  tensorflow::string job_name;
+  int num_cpus = 1;
+  int num_gpus = 0;
+  int task_index = 0;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("tf_jobs", &job_spec, "job spec"),
+      tensorflow::Flag("tf_job", &job_name, "job name"),
+      tensorflow::Flag("tf_task", &task_index, "task index"),
+      tensorflow::Flag("num_cpus", &num_cpus, "number of CPUs"),
+      tensorflow::Flag("num_gpus", &num_gpus, "number of GPUs"),
+  };
+  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result || argc != 1) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+  tensorflow::ServerDef def;
+  tensorflow::Status s = tensorflow::FillServerDef(job_spec, job_name, num_cpus,
+                                                   num_gpus, task_index, &def);
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not parse job spec: " << s.error_message() << "\n"
+               << usage;
+    return -1;
+  }
+  std::unique_ptr<tensorflow::ServerInterface> svr;
+  s = tensorflow::NewServer(def, &svr);
+
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not create server: " << s.error_message();
+    return -1;
+  }
+  TF_QCHECK_OK(svr->Start());
+  TF_QCHECK_OK(svr->Join());
+  return 0;
+}
diff --git a/tensorflow_networking/verbs/verbs_util.cc b/tensorflow_networking/verbs/verbs_util.cc
index 20d2c71..356f856 100644
--- a/tensorflow_networking/verbs/verbs_util.cc
+++ b/tensorflow_networking/verbs/verbs_util.cc
@@ -44,7 +44,7 @@ void VerbsUtil::GetKeyAndStepId(const string& key_with_step_id, string& key,
   CHECK(parts.size() == 6) << "Key with step_id must have 6 parts";
   strings::safe_strto64(parts[5], &step_id);
   parts.pop_back();  // remove step_id
-  key.assign(str_util::Join(parts, ";"));  // stitch them together
+  key.assign(absl::StrJoin(parts, ";"));  // stitch them together
 }

 }  // namespace tensorflow
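The join that `GetKeyAndStepId` undoes is the one `AppendStepidToKey` performs: a rendezvous key has five `;`-separated parts, and the step id is appended as a sixth. A self-contained illustration of the round trip (the key string is a made-up example in the usual rendezvous format):

    // Stand-alone illustration of the AppendStepidToKey / GetKeyAndStepId
    // round trip; uses only the standard library.
    #include <cassert>
    #include <cstdint>
    #include <string>

    int main() {
      const std::string key =
          "/job:worker/replica:0/task:0;0000000000000001;"
          "/job:worker/replica:0/task:1;edge_1_tensor;0:0";  // 5 parts
      const int64_t step_id = 42;

      // AppendStepidToKey: glue ";<step_id>" onto the key.
      const std::string key_with_step = key + ";" + std::to_string(step_id);

      // GetKeyAndStepId: the step id is everything after the last ';'.
      const size_t pos = key_with_step.rfind(';');
      assert(std::stoll(key_with_step.substr(pos + 1)) == step_id);
      assert(key_with_step.substr(0, pos) == key);
      return 0;
    }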
diff --git a/tensorflow_networking/verbs/verbs_util.h b/tensorflow_networking/verbs/verbs_util.h
index 6277bc4..db76b2e 100644
--- a/tensorflow_networking/verbs/verbs_util.h
+++ b/tensorflow_networking/verbs/verbs_util.h
@@ -17,11 +17,106 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_

 #include <string>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
+#include <infiniband/verbs.h>

 #include "tensorflow/core/framework/types.h"

 namespace tensorflow {

+namespace {
+
+int RDMACQNUMS() {
+  const char* env_p = std::getenv("RDMA_CQ_NUMS");
+  int nums = 1;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> nums;
+  }
+  LOG(INFO) << "RDMA_CQ_NUMS:" << nums;
+  return nums;
+}
+
+int RDMACQPOOLSIZE() {
+  const char* env_p = std::getenv("RDMA_CQPOOL_SIZE");
+  int pool_size = 20;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> pool_size;
+  }
+  return pool_size;
+}
+
+int RDMATENSORBUFFERRATIO() {
+  const char* env_p = std::getenv("RDMA_TENSOR_BUFFER_RATIO");
+  int ratio = 5;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> ratio;
+  }
+  LOG(INFO) << "RDMA_TENSOR_BUFFER_RATIO:" << ratio;
+  return ratio;
+}
+
+int RDMAENABLESENDDRIERN() {
+  const char* env_p = std::getenv("RDMASendDriver");
+  int send_driver = 0;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> send_driver;
+  }
+  return send_driver;
+}
+
+int RDMACHUNKSIZE() {
+  const char* env_p = std::getenv("RDMAChunkSize");
+  int chunk_size = 60 * 1024 * 1024;
+  if (env_p != nullptr) {
+    std::stringstream ss(env_p);
+    ss >> chunk_size;
+  }
+  return chunk_size;
+}
+
+std::string GetMetaOutput() {
+  const char* env_p = std::getenv("RDMAMetaOutput");
+  if (env_p != nullptr) {
+    return std::string(env_p);
+  }
+  return "viewfs://hadoop-meituan/user/hadoop-hdp/wuyongyu02/default_output";
+}
+
+std::string GetWorkerMetas() {
+  /*
+    edge_name#size|edge_name#size
+  */
+  const char* env_p = std::getenv("RDMAWorkerMetas");
+  if (env_p != nullptr) {
+    return std::string(env_p);
+  }
+  return "edge_6389_global_step;0:0#80";
+}
+
+}  // end namespace
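The helpers above all repeat one pattern: read an integer knob from the environment, falling back to a default. A sketch of a single helper that could replace them (the name `IntFromEnv` is mine, not the patch's):

    // Possible consolidation of the repeated getenv-and-parse pattern.
    #include <cstdlib>
    #include <sstream>

    static int IntFromEnv(const char* name, int default_value) {
      const char* env_p = std::getenv(name);
      if (env_p == nullptr) return default_value;
      int value = default_value;
      std::stringstream ss(env_p);
      ss >> value;
      return value;
    }

    // e.g. int cq_nums = IntFromEnv("RDMA_CQ_NUMS", 1);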
 class VerbsUtil {
  public:
   static string AppendStepidToKey(const string& key, int64 step_id);
@@ -29,5 +124,104 @@ class VerbsUtil {
                               int64& step_id);
 };

+
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))
+
+template <typename T>
+static inline T align_floor(T v, T align) {
+  return v - (v % align);
+}
+
+template <typename T>
+static inline T align_ceil(T v, T align) {
+  return align_floor(v + align - 1, align);
+}
+
+static inline size_t ib_allocate_size(size_t size) {
+  size_t page_size = 4096;
+  return ROUNDUP(size, page_size);
+}
+
+static inline void ib_malloc(void** ptr, size_t* allocate_size, size_t size,
+                             int minimum_alignment) {
+  void* p;
+  *allocate_size = size;
+  const int required_alignment = sizeof(void*);
+  if (minimum_alignment < required_alignment) {
+    p = malloc(size);
+  } else {
+    int err = posix_memalign(&p, minimum_alignment, size);
+    CHECK_EQ(err, 0) << "posix_memalign failed";
+  }
+  *ptr = p;
+}
+
+class MemoryAllocator {
+ public:
+  explicit MemoryAllocator(struct ibv_pd *pd) {
+    std::lock_guard<std::mutex> lk(mu_);
+    pd_ = pd;
+  }
+
+  ~MemoryAllocator() {
+    std::lock_guard<std::mutex> lk(mu_);
+    for (auto &it : mr_) {
+      ibv_dereg_mr(it.second);
+      free(it.first);
+    }
+  }
+
+  char *Alloc(size_t size) {
+    if (size == 0) {
+      return nullptr;
+    }
+
+    // align to page size (usually 4KB)
+    size = align_ceil(size, pagesize_);
+
+    char *p;
+    size_t allocate_size = size;
+    ib_malloc((void**) &p, &allocate_size, size, 64);
+    CHECK(p);
+
+    struct ibv_mr *mr;
+    CHECK(mr = ibv_reg_mr(pd_, p, size,
+                          IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+
+    std::lock_guard<std::mutex> lk(mu_);
+    mr_[p] = mr;
+    used_list.emplace(p, size);
+
+    return p;
+  }
+
+  uint32_t LocalKey(char *addr) {
+    return Addr2MR(addr)->lkey;
+  }
+
+  uint32_t RemoteKey(char *addr) {
+    return Addr2MR(addr)->rkey;
+  }
+
+  struct ibv_pd* GetPD() {
+    return pd_;
+  }
+
+ private:
+  // convert the memory address to its associated RDMA memory region
+  inline struct ibv_mr* Addr2MR(char *addr) {
+    std::lock_guard<std::mutex> lk(mu_);
+    auto it = mr_.find(addr);
+    CHECK(it != mr_.end());
+
+    return it->second;
+  }
+
+  std::mutex mu_;
+  struct ibv_pd *pd_;
+  size_t pagesize_ = sysconf(_SC_PAGESIZE);
+  std::unordered_map<char*, size_t> used_list;
+  std::unordered_map<char*, struct ibv_mr*> mr_;
+};
+
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_
diff --git a/tensorflow_networking/verbs/verbs_with_0_copies.xml b/tensorflow_networking/verbs/verbs_with_0_copies.xml
index c3d79e7..16130a9 100644
--- a/tensorflow_networking/verbs/verbs_with_0_copies.xml
+++ b/tensorflow_networking/verbs/verbs_with_0_copies.xml
@@ -1 +1 @@
-7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D
+7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D \ No newline at end of file diff --git a/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml b/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml index c6b49d7..484e7c7 100644 --- a/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml +++ b/tensorflow_networking/verbs/verbs_with_0_copies_phase1_protocol.xml @@ -1 +1 @@ 
-7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8= 
+7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8= \ No newline at end of file