From 7ba3c4e43be493e28d0008c31f60afe2a666d355 Mon Sep 17 00:00:00 2001
From: Alexander Morozov
Date: Sun, 24 Apr 2016 00:35:44 +0300
Subject: [PATCH] refactor/sync: convert to the new memory management API

Refactor the CUDA and Native backend code to match autumnai/collenchyma#62,
which provides enhanced memory management and synchronization. Since memory
management is now automatic, `*_plain` variants of functions are removed.

BREAKING CHANGE: `*_plain` versions of API functions are removed; arguments
of their counterpart functions may have changed in mutability.

REFERENCE: autumnai/collenchyma#37, autumnai/collenchyma#62
---
 src/frameworks/cuda/helper.rs   | 1190 +++++++++----------------
 src/frameworks/cuda/mod.rs      |   10 +-
 src/frameworks/native/helper.rs |  512 +++++--------
 src/frameworks/native/mod.rs    |   32 +
 src/plugin.rs                   |  416 +++--------
 5 files changed, 644 insertions(+), 1516 deletions(-)

diff --git a/src/frameworks/cuda/helper.rs b/src/frameworks/cuda/helper.rs
index a4416a8..3753196 100644
--- a/src/frameworks/cuda/helper.rs
+++ b/src/frameworks/cuda/helper.rs
@@ -1,27 +1,51 @@
 //! Provides useful macros for easier NN implementation for CUDA/cuDNN.
 
-/// Returns cuDNN ready memory pointer from a SharedTensor.
-pub unsafe fn receive_memory_ptr<T>(x: &::co::tensor::SharedTensor<T>, device: &::co::device::DeviceType) -> Result<*const ::libc::c_void, ::co::plugin::Error> {
-    Ok(::std::mem::transmute::<u64, *const ::libc::c_void>(
-        *try!(
-            try!(
-                x.get(device).ok_or(::co::plugin::Error::MissingMemoryForDevice("Unable to resolve memory."))
-            ).as_cuda().ok_or(::co::plugin::Error::MissingMemoryForDevice("Unable to receive CUDA memory."))
-        ).id_c()
-    ))
+macro_rules! read {
+    ($x:ident, $slf:ident) => (
+        try!($x.read($slf.device())).as_cuda()
+            .expect("Broken invariant: not a CUDA memory")
+    )
+}
+
+macro_rules! read_write {
+    ($x:ident, $slf:ident) => (
+        try!($x.read_write($slf.device())).as_cuda()
+            .expect("Broken invariant: not a CUDA memory")
+    )
+}
+
+macro_rules! write_only {
+    ($x:ident, $slf:ident) => (
+        try!($x.write_only($slf.device())).as_cuda()
+            .expect("Broken invariant: not a CUDA memory")
+    )
+}
+
+// trans! cannot be inlined into the macros above, because `$mem` would become
+// an intermediate variable and the `*mut $t` would outlive it.
+macro_rules! trans {
+    ($mem:ident) => (
+        unsafe { ::std::mem::transmute::<u64, *const ::libc::c_void>(*$mem.id_c()) }
+    )
 }
 
-/// Returns mutable cuDNN ready memory pointer from a SharedTensor.
-pub unsafe fn receive_memory_ptr_mut<T>(x: &mut ::co::tensor::SharedTensor<T>, device: &::co::device::DeviceType) -> Result<*mut ::libc::c_void, ::co::plugin::Error> {
-    Ok(::std::mem::transmute::<u64, *mut ::libc::c_void>(
-        *try!(
-            try!(
-                x.get_mut(device).ok_or(::co::plugin::Error::MissingMemoryForDevice("Unable to resolve memory."))
-            ).as_mut_cuda().ok_or(::co::plugin::Error::MissingMemoryForDevice("Unable to receive CUDA memory."))
-        ).id_c()
-    ))
+macro_rules! trans_mut {
+    ($mem:ident) => (
+        unsafe { ::std::mem::transmute::<u64, *mut ::libc::c_void>(*$mem.id_c()) }
+    )
+}
+
+macro_rules! exec {
+    ($name:ident, $f:expr) => ({
+        let res = $f;
+        // FIXME: can we properly include original error?
+        // Use String instead of &str or trait objects?
+        res.map_err(|_| PluginError::Operation(
+            stringify!(Unable to execute CUDA cuDNN $name)).into())
+    })
 }
 
+
 #[macro_export]
 macro_rules! impl_oconf_for_cc(($($t: ident), +) => (
     $(
@@ -43,673 +67,241 @@ macro_rules! impl_oconf_for_pooling(($($t: ident), +) => (
     )+
 ));
 
-#[macro_export]
-macro_rules! 
impl_ops_sigmoid_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Sigmoid<$t> for $b { - fn sigmoid( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.sigmoid_plain(x, result) - } - - fn sigmoid_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Forward.")) - } - })) - } - - fn sigmoid_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.sigmoid_grad_plain(x, x_diff, result, result_diff) - } - - fn sigmoid_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Backward.")) - } - })) - } - } - ) -} +// Implementation of Sigmoid, Relu, Tanh is mostly the same, excluding +// trait and function names. And it's quite big, so I think not repeating +// it here 3 times is worth another level of indirection. +// Since concat_idents!() is not stable, this macro has a lot of arguments. #[macro_export] -macro_rules! 
impl_ops_sigmoid_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::SigmoidPointwise<$t> for $b { - fn sigmoid_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.sigmoid_pointwise_plain(x) - } - - fn sigmoid_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise forward.")) - } - })) - } - - fn sigmoid_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.sigmoid_pointwise_grad_plain(x, x_diff) +macro_rules! impl_activation_ops { + ($t:ty, $b:ty, + $plugin_name:ident, $plugin_pointwise_name:ident, + $fwd_cuda:ident, $bkw_cuda:ident, + $fwd_name:ident, $bkw_name:ident, + $fwd_pointwise_name:ident, $bkw_pointwise_name:ident) => ( + + impl ::plugin::$plugin_name<$t> for $b { + fn $fwd_name(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let r_desc = try!(result.cudnn_tensor_desc_flat()); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + + exec!($fwd_name, CUDNN.$fwd_cuda( + &try!(x.cudnn_tensor_desc_flat()), + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::<$t>::default())) + + } + + fn $bkw_name(&self, + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let dr_desc = try!(result_diff.cudnn_tensor_desc_flat()); + let x_mem = read!(x, self); + let dx_mem = read!(x_diff, self); + let r_mem = read!(result, self); + let dr_mem = write_only!(result_diff, self); + + exec!($bkw_name, CUDNN.$bkw_cuda( + &try!(x.cudnn_tensor_desc_flat()), + trans!(x_mem), + &try!(x_diff.cudnn_tensor_desc_flat()), + trans!(dx_mem), + &try!(result.cudnn_tensor_desc_flat()), + trans!(r_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } + } - fn sigmoid_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // 
dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise backward.")) - } - })) + impl ::plugin::$plugin_pointwise_name<$t> for $b { + fn $fwd_pointwise_name(&self, x: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let x_desc = try!(x.cudnn_tensor_desc_flat()); + let x_mem = read_write!(x, self); + exec!($fwd_pointwise_name, CUDNN.$fwd_cuda( + &x_desc, + trans!(x_mem), + &x_desc, + trans_mut!(x_mem), + ScalParams::<$t>::default())) + } + + fn $bkw_pointwise_name(&self, x: &SharedTensor<$t>, + x_diff: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let x_desc = try!(x.cudnn_tensor_desc_flat()); + let dx_desc = try!(x_diff.cudnn_tensor_desc_flat()); + let x_mem = read!(x, self); + let dx_mem = read_write!(x_diff, self); + exec!($bkw_pointwise_name, CUDNN.$bkw_cuda( + &x_desc, trans!(x_mem), + &dx_desc, trans!(dx_mem), + &x_desc, trans!(x_mem), + &dx_desc, trans_mut!(dx_mem), + ScalParams::<$t>::default())) } } ) } -#[macro_export] -macro_rules! impl_ops_relu_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Relu<$t> for $b { - fn relu( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.relu_plain(x, result) - } - - fn relu_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Forward.")) - } - })) - } - - fn relu_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.relu_grad_plain(x, x_diff, result, result_diff) - } - - fn relu_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { 
::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Backward.")) - } - })) - } - } +macro_rules! impl_ops_sigmoid_for { + ($t:ty, $b:ty) => ( + impl_activation_ops!( + $t, $b, + Sigmoid, SigmoidPointwise, + sigmoid_forward, sigmoid_backward, + sigmoid, sigmoid_grad, + sigmoid_pointwise, sigmoid_pointwise_grad); ) } -#[macro_export] -macro_rules! impl_ops_relu_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::ReluPointwise<$t> for $b { - fn relu_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.relu_pointwise_plain(x) - } - - fn relu_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise forward.")) - } - })) - } - - fn relu_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.relu_pointwise_grad_plain(x, x_diff) - } - - fn relu_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise backward.")) - } - })) - } - } +macro_rules! 
impl_ops_relu_for { + ($t:ty, $b:ty) => ( + impl_activation_ops!( + $t, $b, + Relu, ReluPointwise, + relu_forward, relu_backward, + relu, relu_grad, + relu_pointwise, relu_pointwise_grad); ) } -#[macro_export] macro_rules! impl_ops_tanh_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Tanh<$t> for $b { - fn tanh( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.tanh_plain(x, result) - } - - fn tanh_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Forward.")) - } - })) - } - - fn tanh_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.tanh_grad_plain(x, x_diff, result, result_diff) - } - - fn tanh_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Backward.")) - } - })) - } - } + ($t:ty, $b:ty) => ( + impl_activation_ops!( + $t, $b, + Tanh, TanhPointwise, + tanh_forward, tanh_backward, + tanh, tanh_grad, + tanh_pointwise, tanh_pointwise_grad); ) } -#[macro_export] -macro_rules! 
impl_ops_tanh_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::TanhPointwise<$t> for $b { - fn tanh_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.tanh_pointwise_plain(x) - } - - fn tanh_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise forward.")) - } - })) - } - - fn tanh_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.tanh_pointwise_grad_plain(x, x_diff) - } - fn tanh_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise backward.")) - } - })) - } - } - ) -} #[macro_export] macro_rules! 
impl_ops_convolution_for { ($t:ty, $b:ty) => ( fn convolution( &self, - filter: &mut ::co::tensor::SharedTensor<$t>, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC //::frameworks::cuda::CC - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_plain(filter, x, result, workspace, config) - } - - fn convolution_plain( - &self, - filter: &::co::tensor::SharedTensor<$t>, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_forward( + filter: &SharedTensor<$t>, + x: &SharedTensor<$t>, + result: &mut SharedTensor<$t>, + workspace: &mut SharedTensor, + config: &Self::CC) -> Result<(), CoError> { + + let r_desc = try!(result.cudnn_tensor_desc()); + let f_mem = read!(filter, self); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + let w_mem = write_only!(workspace, self); + + exec!(convolution, CUDNN.convolution_forward::<$t>( config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), + trans_mut!(w_mem), + trans!(f_mem), &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Forward.")) - } - })) + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::default())) } #[allow(unused_variables)] fn convolution_grad_filter( &self, - src_data: &mut ::co::tensor::SharedTensor<$t>, - dest_diff: &mut ::co::tensor::SharedTensor<$t>, - filter_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - match src_data.add_device(self.device()) { _ => try!(src_data.sync(self.device())) } - match dest_diff.add_device(self.device()) { _ => try!(dest_diff.sync(self.device())) } - match filter_diff.add_device(self.device()) { _ => try!(filter_diff.sync(self.device())) } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_grad_filter_plain(src_data, dest_diff, filter_diff, workspace, config) - } - - #[allow(unused_variables)] - fn convolution_grad_filter_plain( - &self, - src_data: &::co::tensor::SharedTensor<$t>, - dest_diff: &::co::tensor::SharedTensor<$t>, - filter_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_backward_filter( + src_data: &SharedTensor<$t>, + dest_diff: &SharedTensor<$t>, + filter_diff: &mut SharedTensor<$t>, + workspace: &mut SharedTensor, + config: &Self::CC) -> 
Result<(), CoError> { + + let s_mem = read!(src_data, self); + let dd_mem = read!(dest_diff, self); + let df_mem = write_only!(filter_diff, self); + let w_mem = write_only!(workspace, self); + exec!(convolution_grad_filter, CUDNN.convolution_backward_filter( config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), + trans_mut!(w_mem), &try!(src_data.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(src_data, self.device()) }), + trans!(s_mem), &try!(dest_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(dest_diff, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(filter_diff, self.device()) }), - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) - } - })) + trans!(dd_mem), + trans_mut!(df_mem), + ScalParams::<$t>::default())) } #[allow(unused_variables)] fn convolution_grad_data( &self, - filter: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - match filter.add_device(self.device()) { _ => try!(filter.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => try!(result_diff.sync(self.device())) } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_grad_data_plain(filter, x_diff, result_diff, workspace, config) - } - - #[allow(unused_variables)] - fn convolution_grad_data_plain( - &self, - filter: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_backward_data( + filter: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, + workspace: &mut SharedTensor, + config: &Self::CC) -> Result<(), CoError> { + + let dr_desc = try!(result_diff.cudnn_tensor_desc()); + let f_mem = read!(filter, self); + let dx_mem = read!(x_diff, self); + let dr_mem = write_only!(result_diff, self); + let w_mem = write_only!(workspace, self); + exec!(convolution_grad_data, CUDNN.convolution_backward_data( config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), + trans_mut!(w_mem), + trans!(f_mem), &try!(x_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), - &try!(result_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) - } - })) + trans!(dx_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } ) } #[macro_export] macro_rules! 
impl_ops_softmax_for { - ($t:ident, $b:ty) => ( + ($t:ty, $b:ty) => ( impl ::plugin::Softmax<$t> for $b { - fn softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.softmax_plain(x, result) - } - - fn softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.softmax_forward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_softmax()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Forward.")) - } - })) + fn softmax(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let r_desc = try!(result.cudnn_tensor_desc_softmax()); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(softmax, CUDNN.softmax_forward( + &try!(x.cudnn_tensor_desc_softmax()), + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::<$t>::default())) } fn softmax_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.softmax_grad_plain(x, x_diff, result_diff) - } + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) -> Result<(), CoError> { - fn softmax_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.softmax_backward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Backward.")) - } - })) + let dr_desc = try!(result_diff.cudnn_tensor_desc_softmax()); + let x_mem = read!(x, self); + let dx_mem = read!(x_diff, self); + let dr_mem = write_only!(result_diff, self); + + exec!(softmax_backward, CUDNN.softmax_backward( + &try!(x.cudnn_tensor_desc_softmax()), + trans!(x_mem), + &try!(x_diff.cudnn_tensor_desc_softmax()), + trans!(dx_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } 
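+
+            // For reference, the access macros used above are thin wrappers
+            // over the new SharedTensor API. Hand-expanded (a sketch, not
+            // generated code), `read!(x, self)` is roughly
+            //
+            //     try!(x.read(self.device())).as_cuda()
+            //         .expect("Broken invariant: not a CUDA memory")
+            //
+            // while `write_only!` requests memory without syncing stale
+            // contents, and `exec!` maps a cuDNN failure into
+            // `PluginError::Operation`.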
} ) @@ -717,75 +309,40 @@ macro_rules! impl_ops_softmax_for { #[macro_export] macro_rules! impl_ops_log_softmax_for { - ($t:ident, $b:ty) => ( + ($t:ty, $b:ty) => ( impl ::plugin::LogSoftmax<$t> for $b { - fn log_softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.log_softmax_plain(x, result) - } - - fn log_softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.log_softmax_forward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_softmax()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Forward.")) - } - })) + fn log_softmax(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), CoError> { + let r_desc = try!(result.cudnn_tensor_desc_softmax()); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(log_softmax, CUDNN.log_softmax_forward( + &try!(x.cudnn_tensor_desc_softmax()), + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::<$t>::default())) } fn log_softmax_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.log_softmax_grad_plain(x, x_diff, result_diff) - } + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) -> Result<(), CoError> { - fn log_softmax_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.log_softmax_backward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Backward.")) - } - })) + let dr_desc = try!(result_diff.cudnn_tensor_desc_softmax()); + let x_mem = read!(x, self); + let dx_mem = read!(x_diff, self); + let dr_mem = write_only!(result_diff, self); + + exec!(log_softmax_backward, CUDNN.log_softmax_backward( + 
&try!(x.cudnn_tensor_desc_softmax()), + trans!(x_mem), + &try!(x_diff.cudnn_tensor_desc_softmax()), + trans!(dx_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } } ) @@ -793,98 +350,53 @@ macro_rules! impl_ops_log_softmax_for { #[macro_export] macro_rules! impl_ops_lrn_for { - ($t:ident, $b:ty) => ( + ($t:ty, $b:ty) => ( impl ::plugin::LRN<$t> for $b { - fn new_lrn_config( - &self, - n: u32, - alpha: f64, - beta: f64, - k: f64 - ) -> Result { + fn new_lrn_config(&self, n: u32, alpha: f64, beta: f64, k: f64) + -> Result { + // FIXME: unwrap() Ok(CUDNN.init_normalization(n, alpha, beta, k).unwrap()) } - fn lrn( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN //::frameworks::cuda::CC - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.lrn_plain(x, result, config) - } - - fn lrn_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.lrn_forward( + fn lrn(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>, + config: &Self::CLRN) -> Result<(), CoError> { + let r_desc = try!(result.cudnn_tensor_desc()); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(lrn_forward, CUDNN.lrn_forward( config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Forward.")) - } - })) + &try!(x.cudnn_tensor_desc()), + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::<$t>::default())) } #[allow(unused_variables)] fn lrn_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.lrn_grad_plain(x, x_diff, result, result_diff, config) - } - - #[allow(unused_variables)] - fn lrn_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.lrn_backward( + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, + config: &Self::CLRN) -> Result<(), CoError> { + + let dr_desc = try!(result_diff.cudnn_tensor_desc()); + let x_mem = read!(x, self); + let dx_mem = read!(x_diff, self); + let r_mem = read!(result, 
self); + let dr_mem = write_only!(result_diff, self); + exec!(lrn_backward, CUDNN.lrn_backward( config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Backward.")) - } - })) + &try!(x.cudnn_tensor_desc()), + trans!(x_mem), + &try!(x_diff.cudnn_tensor_desc()), + trans!(dx_mem), + &try!(result.cudnn_tensor_desc()), + trans!(r_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } } ) @@ -892,99 +404,59 @@ macro_rules! impl_ops_lrn_for { #[macro_export] macro_rules! impl_ops_pooling_for { - ($t:ident, $b:ty) => ( + ($t:ty, $b:ty) => ( impl ::plugin::Pooling<$t> for $b { - fn new_pooling_config( - &self, - window: &[i32], - padding: &[i32], - stride: &[i32], - ) -> Result { - let pooling_avg = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, window, padding, stride).unwrap(); - let pooling_max = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_MAX, window, padding, stride).unwrap(); - Ok(::cudnn::utils::PoolingConfig::new(pooling_avg, pooling_max)) - } - - fn pooling_max( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.pooling_max_plain(x, result, config) - } - - fn pooling_max_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.pooling_max_forward( + fn new_pooling_config(&self, window: &[i32], padding: &[i32], + stride: &[i32]) -> Result { + // FIXME: unwraps + let pooling_avg = PoolingDescriptor::new( + cudnnPoolingMode_t::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, + window, padding, stride).unwrap(); + let pooling_max = PoolingDescriptor::new( + cudnnPoolingMode_t::CUDNN_POOLING_MAX, + window, padding, stride).unwrap(); + Ok(utils::PoolingConfig::new(pooling_avg, pooling_max)) + } + + fn pooling_max(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>, + config: &Self::CPOOL) -> Result<(), CoError> { + let r_desc = try!(result.cudnn_tensor_desc()); + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(pooling_max_forward, CUDNN.pooling_max_forward( config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - 
Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Forward.")) - } - })) + &try!(x.cudnn_tensor_desc()), + trans!(x_mem), + &r_desc, + trans_mut!(r_mem), + ScalParams::<$t>::default())) } #[allow(unused_variables)] fn pooling_max_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.pooling_max_grad_plain(x, x_diff, result, result_diff, config) - } - - #[allow(unused_variables)] - fn pooling_max_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.pooling_max_backward( + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, + config: &Self::CPOOL) -> Result<(), CoError> { + + let dr_desc = try!(result_diff.cudnn_tensor_desc()); + let x_mem = read!(x, self); + let dx_mem = read!(x_diff, self); + let r_mem = read!(result, self); + let dr_mem = write_only!(result_diff, self); + exec!(pooling_max_backward, CUDNN.pooling_max_backward( config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Backward.")) - } - })) + &try!(x.cudnn_tensor_desc()), + trans!(x_mem), + &try!(x_diff.cudnn_tensor_desc()), + trans!(dx_mem), + &try!(result.cudnn_tensor_desc()), + trans!(r_mem), + &dr_desc, + trans_mut!(dr_mem), + ScalParams::<$t>::default())) } } ) diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index baa5459..6088c38 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -1,9 +1,11 @@ //! Provides NN for a CUDA backend. 
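+//!
+//! As an illustrative sketch of the new call style (crate paths and the
+//! tensor constructor below are assumptions for the example, not part of
+//! this patch):
+//!
+//! ```ignore
+//! use co::prelude::*;
+//! use conn::Sigmoid;
+//!
+//! let backend = Backend::<Cuda>::default().unwrap();
+//! let x = SharedTensor::<f32>::new(&[2, 3]);
+//! let mut result = SharedTensor::<f32>::new(&[2, 3]);
+//! // No manual add_device()/sync() calls: the operation reads `x` and
+//! // writes `result` through the new memory management API.
+//! backend.sigmoid(&x, &mut result).unwrap();
+//! ```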
#![allow(missing_docs)] use ::plugin::*; +use co::Error as CoError; use co::prelude::*; use co::plugin::Error as PluginError; use cudnn::*; +use cudnn::utils::ScalParams; #[macro_use] pub mod helper; @@ -316,10 +318,6 @@ impl_ops_log_softmax_for!(f32, Backend); impl_ops_lrn_for!(f32, Backend); impl_ops_pooling_for!(f32, Backend); -impl_ops_sigmoid_pointwise_for!(f32, Backend); -impl_ops_relu_pointwise_for!(f32, Backend); -impl_ops_tanh_pointwise_for!(f32, Backend); - impl NN for Backend { type CC = utils::ConvolutionConfig; type CLRN = utils::NormalizationConfig; @@ -337,7 +335,3 @@ impl_ops_softmax_for!(f64, Backend); impl_ops_log_softmax_for!(f64, Backend); impl_ops_lrn_for!(f64, Backend); impl_ops_pooling_for!(f64, Backend); - -impl_ops_sigmoid_pointwise_for!(f64, Backend); -impl_ops_relu_pointwise_for!(f64, Backend); -impl_ops_tanh_pointwise_for!(f64, Backend); diff --git a/src/frameworks/native/helper.rs b/src/frameworks/native/helper.rs index d411978..e145b63 100644 --- a/src/frameworks/native/helper.rs +++ b/src/frameworks/native/helper.rs @@ -1,7 +1,9 @@ //! Provides useful macros for easier NN implementation for native. +use co; use co::plugin::numeric_helpers::Float; use co::memory::MemoryType; +use co::plugin::Error as PluginError; #[derive(Debug, Copy, Clone)] #[allow(missing_docs)] @@ -13,6 +15,30 @@ pub struct NormalizationConfig; #[allow(missing_docs)] pub struct PoolingConfig; +macro_rules! read { + ($x:ident, $t:ident, $slf:ident) => ( + try!($x.read($slf.device())).as_native() + .expect("Broken invariant: not a CUDA memory") + .as_slice::<$t>() + ) +} + +macro_rules! read_write { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.read_write($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>() + ) +} + +macro_rules! write_only { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.write_only($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>() + ) +} + /// Just a helper function until SharedTensor has a nice interface for writing data pub fn write_to_memory(mem: &mut MemoryType, data: T) where T::Item: Clone { @@ -30,43 +56,43 @@ where T::Item: Clone { #[inline] /// Computes the Sigmoid Function on the CPU -pub fn sigmoid(x: &T) -> T { - (T::one()) / (T::one() + (-*x).exp()) +pub fn sigmoid(x: T) -> T { + (T::one()) / (T::one() + (-x).exp()) } #[inline] /// Computes the Sigmoid Gradient on the CPU -pub fn sigmoid_grad(x: &T, dx: &T) -> T { - *x * (T::one() -*x) * *dx +pub fn sigmoid_grad(x: T, dx: T) -> T { + x * (T::one() - x) * dx } #[inline] /// Computes the ReLU Function on the CPU -pub fn relu(x: &T) -> T { +pub fn relu(x: T) -> T { let x : T = x.clone(); x.max(T::zero()) } #[inline] /// Computes the ReLU Gradient on the CPU -pub fn relu_grad(x: &T, dx: &T) -> T { - if *x > T::zero() { - return *dx +pub fn relu_grad(x: T, dx: T) -> T { + if x > T::zero() { + return dx } T::zero() } #[inline] /// Computes the Tanh Function on the CPU -pub fn tanh(x: &T) -> T { +pub fn tanh(x: T) -> T { x.tanh() } #[inline] // d/dx tanh x = sech2 x = 1 + tanh2 x /// Computes the Tanh Gradient on the CPU -pub fn tanh_grad(x: &T, dx: &T) -> T { - (T::one() - x.powi(2)) * *dx +pub fn tanh_grad(x: T, dx: T) -> T { + (T::one() - x.powi(2)) * dx } macro_rules! impl_oconf_for_cc(($($t: ident), +) => ( @@ -91,61 +117,25 @@ macro_rules! impl_oconf_for_pooling(($($t: ident), +) => ( #[macro_export] macro_rules! 
impl_ops_sigmoid_for { ($t:ident, $b:ty) => ( - impl ::plugin::Sigmoid<$t> for $b { - fn sigmoid( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - self.sigmoid_plain(x, result) - } - - fn sigmoid_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - let res = input.as_slice::<$t>().iter().map(::frameworks::native::helper::sigmoid); - ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native sigmoid Forward."))) + impl Sigmoid<$t> for $b { + fn sigmoid(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), Error> { + map1(read!(x, $t, self), + write_only!(result, $t, self), + ::frameworks::native::helper::sigmoid) } fn sigmoid_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - self.sigmoid_grad_plain(x, x_diff, result, result_diff) - } - - fn sigmoid_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(sig_data) = x.get(self.device()).unwrap().as_native() { - if let Some(sig_dx) = x_diff.get(self.device()).unwrap().as_native() { - let res = sig_data.as_slice::<$t>().iter() - .zip(sig_dx.as_slice::<$t>().iter()) - .map(|(t, dt)| ::frameworks::native::helper::sigmoid_grad(t, dt)); - ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native sigmoid grad Forward."))) + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) + -> Result<(), Error> { + map2(read!(x, $t, self), + read!(x_diff, $t, self), + write_only!(result_diff, $t, self), + ::frameworks::native::helper::sigmoid_grad) } } ); @@ -154,61 +144,25 @@ macro_rules! impl_ops_sigmoid_for { #[macro_export] macro_rules! 
impl_ops_relu_for { ($t:ident, $b:ty) => ( - impl ::plugin::Relu<$t> for $b { - fn relu( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - self.relu_plain(x, result) - } - - fn relu_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - let res = input.as_slice::<$t>().iter().map(::frameworks::native::helper::relu); - ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native ReLU Forward."))) + impl Relu<$t> for $b { + fn relu(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::co::error::Error> { + map1(read!(x, $t, self), + write_only!(result, $t, self), + ::frameworks::native::helper::relu) } fn relu_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - self.relu_grad_plain(x, x_diff, result, result_diff) - } - - fn relu_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - if let Some(dx) = x_diff.get(self.device()).unwrap().as_native() { - let res = input.as_slice::<$t>().iter() - .zip(dx.as_slice::<$t>().iter()) - .map(|(x, dx)| ::frameworks::native::helper::relu_grad(x, dx)); - ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native ReLU grad Forward."))) + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) + -> Result<(), Error> { + map2(read!(x, $t, self), + read!(x_diff, $t, self), + write_only!(result_diff, $t, self), + ::frameworks::native::helper::relu_grad) } } ); @@ -218,61 +172,24 @@ macro_rules! impl_ops_relu_for { macro_rules! 
impl_ops_tanh_for { ($t:ident, $b:ty) => ( impl ::plugin::Tanh<$t> for $b { - #[inline] - fn tanh( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - self.tanh_plain(x, result) - } - - fn tanh_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - let res = input.as_slice::<$t>().iter().map(::frameworks::native::helper::tanh); - ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native tanh Forward."))) + fn tanh(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::co::error::Error> { + map1(read!(x, $t, self), + write_only!(result, $t, self), + ::frameworks::native::helper::tanh) } fn tanh_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - self.tanh_grad_plain(x, x_diff, result, result_diff) - } - - fn tanh_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - if let Some(dx) = x_diff.get(self.device()).unwrap().as_native() { - let res = input.as_slice::<$t>().iter() - .zip(dx.as_slice::<$t>().iter()) - .map(|(x, dx)| ::frameworks::native::helper::tanh_grad(x, dx)); - ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - } - Err(Error::Plugin(PluginError::Operation("Unable to execute Native tanh_grad Forward."))) + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) + -> Result<(), Error> { + map2(read!(x, $t, self), + read!(x_diff, $t, self), + write_only!(result_diff, $t, self), + ::frameworks::native::helper::tanh_grad) } } ); @@ -284,29 +201,20 @@ macro_rules! 
impl_ops_convolution_for { impl ::plugin::Convolution<$t> for $b { fn new_convolution_config( &self, - src: &::co::tensor::SharedTensor<$t>, - dest: &::co::tensor::SharedTensor<$t>, - filter: &mut ::co::tensor::SharedTensor<$t>, + src: &SharedTensor<$t>, + dest: &SharedTensor<$t>, + filter: &mut SharedTensor<$t>, stride: &[i32], zero_padding: &[i32] ) -> Result { unimplemented!(); Ok(helper::ConvolutionConfig) } - fn convolution( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - fn convolution_plain( + fn convolution( &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + result: &mut SharedTensor<$t>, config: &Self::CC ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -315,22 +223,10 @@ macro_rules! impl_ops_convolution_for { fn convolution_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - - fn convolution_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, config: &Self::CC ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -344,69 +240,40 @@ macro_rules! impl_ops_convolution_for { macro_rules! impl_ops_softmax_for { ($t:ident, $b:ty) => ( impl ::plugin::Softmax<$t> for $b { - fn softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - self.softmax_plain(x, result) - } - fn softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - let mut exps = Vec::with_capacity(x.capacity()); - let mut sum : $t = 0 as $t; - for exp in input.as_slice::<$t>().iter().map(|t|t.exp()) { - exps.push(exp); - sum += exp; - } - let res = exps.iter().map(|t| t / sum); - ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res); - return Ok(()); + fn softmax(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), Error> { + let xs = read!(x, $t, self); + let rs = write_only!(result, $t, self); + + try!(map1(xs, rs, |v| v.exp())); + + let mut sum: $t = 0.0; // iter_arith is not stable yet + for r in &*rs { + sum += *r; } - Err(Error::Plugin( - PluginError::Operation("Unable to execute Native softmax Forward."))) + for r in rs { + *r /= sum; + } + Ok(()) } + + // TODO: check fn softmax_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result_diff.add_device(self.device()) 
{ _ => () } - self.softmax_grad_plain(x, x_diff, result_diff) - } - fn softmax_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(sig_data) = x.get(self.device()).unwrap().as_native() { - if let Some(sig_dx) = x_diff.get(self.device()).unwrap().as_native() { - let mut dot : $t = 0 as $t; - let sig_data_slice = sig_data.as_slice::<$t>(); - let sig_dx_slice = sig_dx.as_slice::<$t>(); - for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) { - dot += t * dt; - } - let res = sig_data_slice.iter() - .zip(sig_dx_slice.iter()) - .map(|(t, dt)| t * (dt - dot)); - ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res); - return Ok(()); - } + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) -> Result<(), Error> { + + let xs = read!(x, $t, self); + let dxs = read!(x_diff, $t, self); + let drs = write_only!(result_diff, $t, self); + + let mut dot: $t = 0 as $t; + for (t, dt) in xs.iter().zip(dxs.iter()) { + dot += t * dt; } - Err(Error::Plugin( - PluginError::Operation("Unable to execute Native softmax Backward."))) + map2(xs, dxs, drs, |t, dt| t * (dt - dot)) } } ); @@ -416,76 +283,35 @@ macro_rules! impl_ops_softmax_for { macro_rules! impl_ops_log_softmax_for { ($t:ident, $b:ty) => ( impl ::plugin::LogSoftmax<$t> for $b { - fn log_softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - self.log_softmax_plain(x, result) - } - fn log_softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(input) = x.get(self.device()).unwrap().as_native() { - let mut max_input = ::std::$t::NEG_INFINITY; - for &input_val in input.as_slice::<$t>() { - max_input = max_input.max(input_val); - } - - let mut logsum : $t = 0 as $t; - for exp in input.as_slice::<$t>().iter().map(|t| (-(max_input - t)).exp()) { - logsum += exp; - } - logsum = max_input + logsum.ln(); - - let res = input.as_slice::<$t>().iter().map(|t| t - logsum); - - ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res); - return Ok(()); + fn log_softmax(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::co::error::Error> { + let xs = read!(x, $t, self); + let rs = write_only!(result, $t, self); + + let max_x = xs.iter().fold(::std::$t::NEG_INFINITY, + |acc, &t| acc.max(t)); + + let mut logsum : $t = 0 as $t; + for t in xs { + logsum += (-(max_x - t)).exp(); } - Err(Error::Plugin( - PluginError::Operation("Unable to execute Native softmax Forward."))) - } - fn log_softmax_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - self.log_softmax_grad_plain(x, x_diff, result_diff) + logsum = max_x + logsum.ln(); + + map1(xs, rs, |t| t - logsum) } - fn log_softmax_grad_plain( - &self, - x: 
&::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - if let Some(sig_data) = x.get(self.device()).unwrap().as_native() { - if let Some(sig_dx) = x_diff.get(self.device()).unwrap().as_native() { - let x_slice = sig_data.as_slice::<$t>(); - let x_diff_slice = sig_dx.as_slice::<$t>(); - let mut sum = 0 as $t; - for &grad_val in x_diff_slice.iter() { - sum += grad_val; - } - let res = x_slice.iter().zip(x_diff_slice.iter()).map(|(x_val, x_diff_val)| { - x_diff_val - x_val.exp() * sum - }); - - ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res); - return Ok(()); - } - } - Err(Error::Plugin( - PluginError::Operation("Unable to execute Native softmax Backward."))) + fn log_softmax_grad(&self, x: &SharedTensor<$t>, x_diff: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>) + -> Result<(), ::co::error::Error> { + let xs = read!(x, $t, self); + let dxs = read!(x_diff, $t, self); + let drs = write_only!(result_diff, $t, self); + + let mut sum = 0 as $t; + for &grad_val in dxs.iter() { + sum += grad_val; + } + map2(xs, dxs, drs, |t, dt| dt - t.exp() * sum) } } ); @@ -508,8 +334,8 @@ macro_rules! impl_ops_lrn_for { fn lrn( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, + x: &mut SharedTensor<$t>, + result: &mut SharedTensor<$t>, config: &Self::CLRN ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -518,8 +344,8 @@ macro_rules! impl_ops_lrn_for { fn lrn_plain( &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + result: &mut SharedTensor<$t>, config: &Self::CLRN ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -528,10 +354,10 @@ macro_rules! impl_ops_lrn_for { fn lrn_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, + x: &mut SharedTensor<$t>, + x_diff: &mut SharedTensor<$t>, + result: &mut SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, config: &Self::CLRN ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -540,10 +366,10 @@ macro_rules! impl_ops_lrn_for { fn lrn_grad_plain( &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, config: &Self::CLRN ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -569,8 +395,8 @@ macro_rules! impl_ops_pooling_for { fn pooling_max( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, + x: &mut SharedTensor<$t>, + result: &mut SharedTensor<$t>, config: &Self::CPOOL ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -579,8 +405,8 @@ macro_rules! impl_ops_pooling_for { fn pooling_max_plain( &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + result: &mut SharedTensor<$t>, config: &Self::CPOOL ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -589,10 +415,10 @@ macro_rules! 
impl_ops_pooling_for { #[allow(unused_variables)] fn pooling_max_grad( &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, + x: &mut SharedTensor<$t>, + x_diff: &mut SharedTensor<$t>, + result: &mut SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, config: &Self::CPOOL ) -> Result<(), ::co::error::Error> { unimplemented!(); @@ -601,10 +427,10 @@ macro_rules! impl_ops_pooling_for { fn pooling_max_grad_plain( &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, + x: &SharedTensor<$t>, + x_diff: &SharedTensor<$t>, + result: &SharedTensor<$t>, + result_diff: &mut SharedTensor<$t>, config: &Self::CPOOL ) -> Result<(), ::co::error::Error> { unimplemented!(); diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs index 1e70e9a..d0e6adf 100644 --- a/src/frameworks/native/mod.rs +++ b/src/frameworks/native/mod.rs @@ -8,10 +8,42 @@ use ::plugin::*; use co::prelude::*; use co::Error; use co::plugin::Error as PluginError; +use co::plugin::numeric_helpers::Float; #[macro_use] pub mod helper; +// Those functions should be in helper.rs, but there is no point to make them +// public. +fn lens_eq(xs: &[T], ys: &[T]) -> Result<(), Error> { + if xs.len() != ys.len() { + return Err(PluginError::Operation("Tensor dimension mismatch").into()); + } + Ok(()) +} + +fn map1(src: &[T], dst: &mut [T], f: F) -> Result<(), Error> + where T: Float, + F: Fn(T) -> T { + try!(lens_eq(dst, src)); + for i in 0..dst.len() { + dst[i] = f(src[i]); + } + Ok(()) +} + +fn map2(src1: &[T], src2: &[T], dst: &mut [T], f: F) -> Result<(), Error> + where T: Float, + F: Fn(T, T) -> T { + try!(lens_eq(dst, src1)); + try!(lens_eq(dst, src2)); + for i in 0..dst.len() { + dst[i] = f(src1[i], src2[i]); + } + Ok(()) +} + + impl_oconf_for_cc!(f32, f64); impl_oconf_for_clrn!(f32, f64); impl_oconf_for_pooling!(f32, f64); diff --git a/src/plugin.rs b/src/plugin.rs index 80aaa5f..dd1ab71 100644 --- a/src/plugin.rs +++ b/src/plugin.rs @@ -156,44 +156,25 @@ pub trait NN { /// Provides the functionality for a Backend to support Sigmoid operations. pub trait Sigmoid : NN { - /// Computes the [Sigmoid function][sigmoid] over the input Tensor `x` with complete memory management. + /// Computes the [Sigmoid function][sigmoid] over the input Tensor `x`. /// [sigmoid]: https://en.wikipedia.org/wiki/Sigmoid_function /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `sigmoid_plain`. - fn sigmoid(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the Sigmoid function over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
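Dropping the `*_plain` variant here works because the native backend now routes these trait methods through the small `lens_eq`/`map1`/`map2` helpers added in `src/frameworks/native/mod.rs` above. Below is a minimal, self-contained sketch of that pattern, specialized to `f32` and with a plain `String` error; the inline sigmoid closure only stands in for the crate's own `::frameworks::native::helper` functions, so names and bounds here are illustrative rather than the crate's actual API.

fn lens_eq(xs: &[f32], ys: &[f32]) -> Result<(), String> {
    if xs.len() != ys.len() {
        return Err("Tensor dimension mismatch".to_string());
    }
    Ok(())
}

fn map1<F>(src: &[f32], dst: &mut [f32], f: F) -> Result<(), String>
    where F: Fn(f32) -> f32 {
    try!(lens_eq(dst, src));
    for i in 0..dst.len() {
        dst[i] = f(src[i]);
    }
    Ok(())
}

fn main() {
    let x = [-1.0f32, 0.0, 1.0];
    let mut y = [0.0f32; 3];
    // Forward pass of sigmoid as the native backend computes it, minus the
    // SharedTensor read!/write_only! bookkeeping.
    map1(&x, &mut y, |v| 1.0 / (1.0 + (-v).exp())).unwrap();
    println!("{:?}", y); // ≈ [0.2689, 0.5, 0.7311]
}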
- /// For a memory managed version see `sigmoid`. - fn sigmoid_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::co::error::Error>; - /// Computes the gradient of a [Sigmoid function][sigmoid] over the input Tensor `x` with complete memory management. + /// Computes the gradient of a [Sigmoid function][sigmoid] over the input Tensor `x`. /// [sigmoid]: https://en.wikipedia.org/wiki/Sigmoid_function /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `sigmoid_grad_plain`. - fn sigmoid_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result: &mut SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a Sigmoid function over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `sigmoid_grad`. - fn sigmoid_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result: &SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn sigmoid_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result: &SharedTensor, result_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for pointwise Sigmoid operations (overwrites the input with the result of the operation). pub trait SigmoidPointwise : NN { - /// Computes the [Sigmoid function][sigmoid] over the input Tensor `x` with complete memory management. + /// Computes the [Sigmoid function][sigmoid] over the input Tensor `x`. /// [sigmoid]: https://en.wikipedia.org/wiki/Sigmoid_function /// /// Saves the result back to `x`. @@ -201,239 +182,132 @@ pub trait SigmoidPointwise : NN { /// For a no-memory managed version see `sigmoid_pointwise_plain`. fn sigmoid_pointwise(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - /// Computes the Sigmoid function over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `sigmoid_pointwise`. - fn sigmoid_pointwise_plain(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a [Sigmoid function][sigmoid] over the input Tensor `x` with complete memory management. + /// Computes the gradient of a [Sigmoid function][sigmoid] over the input Tensor `x`. /// [sigmoid]: https://en.wikipedia.org/wiki/Sigmoid_function /// /// Saves the result back to `x_diff`. - /// - /// For a no-memory managed version see `sigmoid_pointwise_grad_plain`. - fn sigmoid_pointwise_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a Sigmoid function over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `sigmoid_pointwise_grad`. - fn sigmoid_pointwise_grad_plain(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn sigmoid_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support ReLU operations. pub trait Relu : NN { - /// Computes the [Rectified linear units][relu] over the input Tensor `x` with complete memory management. + /// Computes the [Rectified linear units][relu] over the input Tensor `x`. /// [relu]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `relu_plain`. - fn relu(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - /// Computes the ReLU over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `relu`. - fn relu_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of [ReLU][relu] over the input Tensor `x` with complete memory management. + /// Computes the gradient of [ReLU][relu] over the input Tensor `x`. /// [relu]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `relu_grad_plain`. - fn relu_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result: &mut SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of ReLU over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `relu_grad`. - fn relu_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result: &SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn relu_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result: &SharedTensor, result_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for pointwise ReLU operations (overwrites the input with the result of the operation). pub trait ReluPointwise : NN { - /// Computes the [Rectified linear units][relu] over the input Tensor `x` with complete memory management. + /// Computes the [Rectified linear units][relu] over the input Tensor `x`. /// [relu]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) /// /// Saves the result back to `x`. - /// - /// For a no-memory managed version see `relu_pointwise_plain`. fn relu_pointwise(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - /// Computes the ReLU over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `relu_pointwise`. - fn relu_pointwise_plain(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of [ReLU][relu] over the input Tensor `x` with complete memory management. + /// Computes the gradient of [ReLU][relu] over the input Tensor `x`. /// [relu]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) /// /// Saves the result back to `x_diff`. - /// - /// For a no-memory managed version see `relu_pointwise_grad_plain`. - fn relu_pointwise_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of ReLU over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `relu_pointwise_grad`. - fn relu_pointwise_grad_plain(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn relu_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support TanH operations. pub trait Tanh : NN { - /// Computes the [hyperbolic Tangent][tanh] over the input Tensor `x` with complete memory management. + /// Computes the [hyperbolic Tangent][tanh] over the input Tensor `x`. /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `tanh_plain`. - fn tanh(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn tanh(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::co::error::Error>; - /// Computes the tanh over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `tanh`. - fn tanh_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of [tanh][tanh] over the input Tensor `x` with complete memory management. + /// Computes the gradient of [tanh][tanh] over the input Tensor `x`. /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `tanh_grad_plain`. - fn tanh_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result: &mut SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of tanh over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `tanh_grad`. - fn tanh_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result: &SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn tanh_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result: &SharedTensor, result_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } -/// Provides the functionality for pointwise ReLU operations (overwrites the input with the result of the operation). +/// Provides the functionality for pointwise Tanh operations (overwrites the input +/// with the result of the operation). pub trait TanhPointwise : NN { - /// Computes the [hyperbolic Tangent][tanh] over the input Tensor `x` with complete memory management. + /// Computes the [hyperbolic Tangent][tanh] over the input Tensor `x`. /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function /// /// Saves the result back to `x`. - /// - /// For a no-memory managed version see `tanh_pointwise_plain`. fn tanh_pointwise(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - /// Computes the tanh over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `tanh_pointwise`. - fn tanh_pointwise_plain(&self, x: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of [tanh][tanh] over the input Tensor `x` with complete memory management. + /// Computes the gradient of [tanh][tanh] over the input Tensor `x`. /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function /// /// Saves the result back to `x_diff`. - /// - /// For a no-memory managed version see `tanh_pointwise_grad_plain`. - fn tanh_pointwise_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of tanh over the input Tensor `x` without any memory management. - /// - /// Saves the result back to `x_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `tanh_pointwise_grad`. - fn tanh_pointwise_grad_plain(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn tanh_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support Convolution operations. pub trait Convolution : NN { - /// Creates a new ConvolutionConfig, which needs to be passed to further convolution Operations. - fn new_convolution_config(&self, src: &SharedTensor, dest: &SharedTensor, filter: &mut SharedTensor, - algo_fwd: ConvForwardAlgo, algo_bwd_filter: ConvBackwardFilterAlgo, algo_bwd_data: ConvBackwardDataAlgo, - stride: &[i32], zero_padding: &[i32]) -> Result; - - /// Computes a [CNN convolution][convolution] over the input Tensor `x` with complete memory management. + /// Creates a new ConvolutionConfig, which needs to be passed to further + /// convolution Operations. + fn new_convolution_config(&self, + src: &SharedTensor, + dest: &SharedTensor, + filter: &SharedTensor, + algo_fwd: ConvForwardAlgo, + algo_bwd_filter: ConvBackwardFilterAlgo, + algo_bwd_data: ConvBackwardDataAlgo, + stride: &[i32], + zero_padding: &[i32]) + -> Result; + + /// Computes a [CNN convolution][convolution] over the input Tensor `x`. /// [convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `convolution_plain`. - fn convolution(&self, filter: &mut SharedTensor, x: &mut SharedTensor, result: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; - - /// Computes the convolution over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `convolution`. - fn convolution_plain(&self, filter: &SharedTensor, x: &SharedTensor, result: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a [CNN convolution][convolution] with respect to the filter and complete memory management. + fn convolution(&self, + filter: &SharedTensor, + x: &SharedTensor, + result: &mut SharedTensor, + workspace: &mut SharedTensor, + config: &Self::CC) + -> Result<(), ::co::error::Error>; + + /// Computes the gradient of a [CNN convolution][convolution] with respect to the filter. /// [convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network /// /// Saves the result to `filter_diff`. - /// - /// For a no-memory managed version see `convolution_grad_filter_plain`. - fn convolution_grad_filter(&self, src_data: &mut SharedTensor, dest_diff: &mut SharedTensor, filter_diff: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a convolution with respect to the filter and without any memory management. - /// - /// Saves the result to `filter_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `convolution_grad_filter`. - fn convolution_grad_filter_plain(&self, src_data: &SharedTensor, dest_diff: &SharedTensor, filter_diff: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a [CNN convolution][convolution] over the input Tensor `x` with respect to the data and complete memory management. + fn convolution_grad_filter(&self, + src_data: &SharedTensor, + dest_diff: &SharedTensor, + filter_diff: &mut SharedTensor, + workspace: &mut SharedTensor, + config: &Self::CC) + -> Result<(), ::co::error::Error>; + + /// Computes the gradient of a [CNN convolution][convolution] over the input + /// Tensor `x` with respect to the data. /// [convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `convolution_grad_data_plain`. - fn convolution_grad_data(&self, filter: &mut SharedTensor, x_diff: &mut SharedTensor, result_diff: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a convolution over the input Tensor `x` with respect to the data and without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `convolution_grad_data`. - fn convolution_grad_data_plain(&self, filter: &SharedTensor, x_diff: &SharedTensor, result_diff: &mut SharedTensor, workspace: &mut SharedTensor, config: &Self::CC) -> Result<(), ::co::error::Error>; -} + fn convolution_grad_data(&self, + filter: &SharedTensor, + x_diff: &SharedTensor, + result_diff: &mut SharedTensor, + workspace: &mut SharedTensor, + config: &Self::CC) + -> Result<(), ::co::error::Error>; // /// Computes the backward Convolution function w.r.t the bias. // /// @@ -460,155 +334,85 @@ pub trait Convolution : NN { // filter_data: *mut ::libc::c_void, // scale: ScalParams, // } +} /// Provides the functionality for a Backend to support Softmax operations. pub trait Softmax : NN { - /// Computes a [Softmax][softmax] over the input Tensor `x` with complete memory management. + /// Computes a [Softmax][softmax] over the input Tensor `x`. /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `softmax_plain`. - fn softmax(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::co::error::Error>; - /// Computes the softmax over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
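For reference, this is the softmax forward pass exactly as the new native implementation computes it, reduced to plain `f32` slices so it runs on its own; the explicit running-sum loop mirrors the patch, which avoids the then-unstable `iter_arith` feature.

// Standalone sketch: exponentiate, accumulate the sum, then normalize in place.
fn softmax(xs: &[f32], out: &mut [f32]) {
    assert_eq!(xs.len(), out.len());
    let mut sum = 0.0f32;
    for (o, &x) in out.iter_mut().zip(xs) {
        *o = x.exp();
        sum += *o;
    }
    for o in out.iter_mut() {
        *o /= sum;
    }
}

fn main() {
    let x = [1.0f32, 2.0, 3.0];
    let mut y = [0.0f32; 3];
    softmax(&x, &mut y);
    println!("{:?}", y); // ≈ [0.0900, 0.2447, 0.6652]
}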
- /// For a memory managed version see `softmax`. - fn softmax_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a [Softmax][softmax] over the input Tensor `x` with complete memory management. + /// Computes the gradient of a [Softmax][softmax] over the input Tensor `x`. /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `softmax_grad_plain`. - fn softmax_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a softmax over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `softmax_grad`. - fn softmax_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn softmax_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support LogSoftmax operations. pub trait LogSoftmax : NN { - /// Computes a logarithmic softmax over the input Tensor `x` with complete memory management. + /// Computes a logarithmic softmax over the input Tensor `x`. /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `log_softmax_plain`. - fn log_softmax(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the logarithmic softmax over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
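The log-softmax implementation above relies on the usual max-subtraction trick so the exponentials cannot overflow before the logarithm is taken. A standalone sketch of the same arithmetic on plain slices, kept separate from the crate's SharedTensor plumbing:

fn log_softmax(xs: &[f32], out: &mut [f32]) {
    assert_eq!(xs.len(), out.len());
    // Largest input; subtracting it keeps every exponent non-positive.
    let max_x = xs.iter().fold(::std::f32::NEG_INFINITY, |acc, &t| acc.max(t));
    let mut logsum = 0.0f32;
    for &t in xs {
        logsum += (-(max_x - t)).exp();
    }
    let logsum = max_x + logsum.ln();
    for (o, &t) in out.iter_mut().zip(xs) {
        *o = t - logsum;
    }
}

fn main() {
    let x = [1.0f32, 2.0, 3.0];
    let mut y = [0.0f32; 3];
    log_softmax(&x, &mut y);
    println!("{:?}", y); // ≈ [-2.4076, -1.4076, -0.4076]
}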
- /// For a memory managed version see `log_softmax`. - fn log_softmax_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::co::error::Error>; - /// Computes the gradient of a logarithmic softmax over the input Tensor `x` with complete memory management. + /// Computes the gradient of a logarithmic softmax over the input Tensor `x`. /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `log_softmax_grad_plain`. - fn log_softmax_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a logarithmic softmax over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `log_softmax_grad`. - fn log_softmax_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result_diff: &mut SharedTensor) -> Result<(), ::co::error::Error>; + fn log_softmax_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result_diff: &mut SharedTensor) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support Local Response Normalization operations. pub trait LRN : NN { - /// Creates a new (Local Response Normalization) LRNConfig, which needs to be passed to further LRN Operations. - fn new_lrn_config(&self, n: u32, alpha: f64, beta: f64, k: f64) -> Result; + /// Creates a new (Local Response Normalization) LRNConfig, which needs to be + /// passed to further LRN Operations. + fn new_lrn_config(&self, n: u32, alpha: f64, beta: f64, k: f64) + -> Result; - /// Computes a [LRN][lrn] over the input Tensor `x` with complete memory management. + /// Computes a [LRN][lrn] over the input Tensor `x`. /// [lrn]: https://en.wikipedia.org/wiki/lrnal_neural_network /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `lrn_plain`. - fn lrn(&self, x: &mut SharedTensor, result: &mut SharedTensor, config: &Self::CLRN) -> Result<(), ::co::error::Error>; - - /// Computes the LRN over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `lrn`. - fn lrn_plain(&self, x: &SharedTensor, result: &mut SharedTensor, config: &Self::CLRN) -> Result<(), ::co::error::Error>; + fn lrn(&self, x: &SharedTensor, result: &mut SharedTensor, + config: &Self::CLRN) -> Result<(), ::co::error::Error>; - /// Computes the gradient of a [LRN][lrn] over the input Tensor `x` with complete memory management. + /// Computes the gradient of a [LRN][lrn] over the input Tensor `x`. /// [lrn]: https://en.wikipedia.org/wiki/lrnal_neural_network /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `lrn_grad_plain`. - fn lrn_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result: &mut SharedTensor, result_diff: &mut SharedTensor, config: &Self::CLRN) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of a LRN over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `lrn_grad`. - fn lrn_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result: &SharedTensor, result_diff: &mut SharedTensor, config: &Self::CLRN) -> Result<(), ::co::error::Error>; + fn lrn_grad(&self, + x: &SharedTensor, x_diff: &SharedTensor, + result: &SharedTensor, result_diff: &mut SharedTensor, + config: &Self::CLRN) + -> Result<(), ::co::error::Error>; } /// Provides the functionality for a Backend to support Pooling operations. pub trait Pooling : NN { /// Creates a new PoolingConfig, which needs to be passed to further pooling Operations. - fn new_pooling_config(&self, window: &[i32], padding: &[i32], stride: &[i32]) -> Result; + fn new_pooling_config(&self, window: &[i32], padding: &[i32], stride: &[i32]) + -> Result; - /// Computes non-linear down-sampling ([max Pooling][pooling]) over the input Tensor `x` with complete memory management. + /// Computes non-linear down-sampling ([max Pooling][pooling]) over the input Tensor `x`. /// [pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer /// /// Saves the result to `result`. - /// - /// For a no-memory managed version see `pooling_max_plain`. - fn pooling_max(&self, x: &mut SharedTensor, result: &mut SharedTensor, config: &Self::CPOOL) -> Result<(), ::co::error::Error>; + fn pooling_max(&self, x: &SharedTensor, result: &mut SharedTensor, + config: &Self::CPOOL) -> Result<(), ::co::error::Error>; - /// Computes the max pooling over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `pooling_max`. - fn pooling_max_plain(&self, x: &SharedTensor, result: &mut SharedTensor, config: &Self::CPOOL) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of [max Pooling][pooling] over the input Tensor `x` with complete memory management. + /// Computes the gradient of [max Pooling][pooling] over the input Tensor `x`. /// [pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer /// /// Saves the result to `result_diff`. - /// - /// For a no-memory managed version see `pooling_max_grad_plain`. - fn pooling_max_grad(&self, x: &mut SharedTensor, x_diff: &mut SharedTensor, result: &mut SharedTensor, result_diff: &mut SharedTensor, config: &Self::CPOOL) -> Result<(), ::co::error::Error>; - - /// Computes the gradient of max pooling over the input Tensor `x` without any memory management. - /// - /// Saves the result to `result_diff`. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `pooling_max_grad`. - fn pooling_max_grad_plain(&self, x: &SharedTensor, x_diff: &SharedTensor, result: &SharedTensor, result_diff: &mut SharedTensor, config: &Self::CPOOL) -> Result<(), ::co::error::Error>; + fn pooling_max_grad(&self, x: &SharedTensor, x_diff: &SharedTensor, + result: &SharedTensor, result_diff: &mut SharedTensor, + config: &Self::CPOOL) -> Result<(), ::co::error::Error>; }
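With the `*_plain` family gone, downstream code is left with a single entry point per operation and no manual `add_device`/`sync` calls. A hypothetical consumer-side helper, sketched against the reworked `Relu` trait from this file; the generic parameter spelling, the helper itself and its name are illustrative and not part of the patch:

fn forward_relu<F, B>(backend: &B,
                      x: &::co::tensor::SharedTensor<F>,
                      out: &mut ::co::tensor::SharedTensor<F>)
                      -> Result<(), ::co::error::Error>
    where B: ::plugin::Relu<F>
{
    // Single call: `x` is only read, `out` is written, and any allocation or
    // device synchronization happens inside SharedTensor.
    backend.relu(x, out)
}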