diff --git a/cmake/mkl.cmake b/cmake/mkl.cmake index 55115f71..b2a04c16 100644 --- a/cmake/mkl.cmake +++ b/cmake/mkl.cmake @@ -151,7 +151,12 @@ detect_mkl("mkl_rt") if(HAVE_MKL) include_directories(AFTER ${MKLINC}) install(DIRECTORY ${MKLINC}/ DESTINATION include/mklml) - install(DIRECTORY ${MKLINC}/../lib/ DESTINATION lib) + + set(LIBLIST libmklml_intel.so libiomp5.so) + foreach(LIB ${LIBLIST}) + install(FILES ${MKLINC}/../lib/${LIB} DESTINATION lib) + endforeach() + list(APPEND mkldnn_LINKER_LIBS ${MKLLIB}) set(MSG "Intel(R) MKL:") diff --git a/conda/build.sh b/conda/build.sh new file mode 100755 index 00000000..f7079d40 --- /dev/null +++ b/conda/build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Install script for Anaconda environments on macOS and linux. +# This script is not supposed to be called directly, but should be run by: +# +# $ cd +# $ conda build conda +# +# +# If you're debugging this, it may be useful to use the env that conda build is +# using: +# $ cd /conda-bld/ideep_ +# $ source activate _h_env_... # some long path with lots of placeholders +# +# Also, failed builds will accumulate those ideep_ directories. You +# can remove them after a succesfull build with +# $ conda build purge +# +git submodule update --init +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$HOME/.local .. +cd ../python +python setup.py install diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 00000000..9be14791 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,27 @@ +{% set version = "2.0.0b1" %} + +package: + name: ideep4py + version: {{ version }} + +source: + path: ../ + +build: + number: 0 + skip: True # [win] + +requirements: + build: + - numpy + - python + run: + - numpy + - python + +test: + imports: + - ideep4py + +about: + license: MIT diff --git a/docker/python/Dockerfile b/docker/python/Dockerfile new file mode 100644 index 00000000..f39b0591 --- /dev/null +++ b/docker/python/Dockerfile @@ -0,0 +1,11 @@ +FROM ubuntu:16.04 + +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + python-dev \ + python-pip \ + python-wheel \ + python-setuptools && \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + +RUN pip install ideep4py diff --git a/include/ideep.hpp b/include/ideep.hpp index b5e7c000..f62d566e 100644 --- a/include/ideep.hpp +++ b/include/ideep.hpp @@ -43,10 +43,7 @@ #include "ideep/tensor.hpp" #include "ideep/computations.hpp" #include "ideep/allocators.hpp" - -#if __GNUC__ > 4 #include "ideep/fast_math.hpp" -#endif #endif diff --git a/include/ideep/abstract_types.hpp b/include/ideep/abstract_types.hpp index 19306d9b..8cd255f3 100644 --- a/include/ideep/abstract_types.hpp +++ b/include/ideep/abstract_types.hpp @@ -21,38 +21,27 @@ using error = mkldnn::error; // For 2D convolution with grouped weights, the ndims must be 5 (goihw) #define IDEEP_IS_GROUPED_4DIMS(d) (((d).size() == 5) ? 1 : 0) +#define IDEEP_MOD_PTR(ptr, bytes) (((uintptr_t)(ptr)) & ((bytes) - 1)) +#define IDEEP_IS_ALIGNED_PTR(ptr, bytes) ((IDEEP_MOD_PTR(ptr, bytes)) == 0) + /// Same class for resource management, except public default constructor /// Movable support for better performance template > -class c_wrapper{ -protected: - std::shared_ptr::type> _data; +class c_wrapper : + public std::shared_ptr::type> { + using super = std::shared_ptr::type>; public: /// Constructs a C handle wrapper. /// @param t The C handle to wrap. /// @param weak A flag to specify whether to construct a weak wrapper. 
- c_wrapper(T t = nullptr, bool weak = false): _data(t, [weak]() { + c_wrapper(T t = nullptr, bool weak = false): super(t, [weak]() { auto dummy = [](T) { return decltype(traits::destructor(0))(0); }; return weak? dummy : traits::destructor; }()) {} - bool operator==(const T other) const { return other == _data.get(); } - bool operator!=(const T other) const { return !(*this == other); } - - c_wrapper(const c_wrapper& other): _data(other._data) {} - c_wrapper(c_wrapper&& movable) : _data(std::move(movable._data)) {} - - c_wrapper &operator=(c_wrapper&& other) { - _data = std::move(other._data); - return *this; - } - - c_wrapper &operator=(const c_wrapper& other) { - _data = other._data; - return *this; - } + using super::super; /// Resets the value of a C handle. /// @param t The new value of the C handle. @@ -61,17 +50,7 @@ class c_wrapper{ auto dummy_destructor = [](T) { return decltype(traits::destructor(0))(0); }; - _data.reset(t, weak ? dummy_destructor : traits::destructor); - } - - /// Returns the value of the underlying C handle. - T get() const { return _data.get(); } - - bool operator==(const c_wrapper &other) const { - return other._data.get() == _data.get(); - } - bool operator!=(const c_wrapper &other) const { - return !(*this == other); + super::reset(t, weak ? dummy_destructor : traits::destructor); } }; diff --git a/include/ideep/computations.hpp b/include/ideep/computations.hpp index 05210ac0..7c24eaed 100644 --- a/include/ideep/computations.hpp +++ b/include/ideep/computations.hpp @@ -49,13 +49,10 @@ #include "scope_guard.hpp" #include "instruments.hpp" #include - -#if __GNUC__ > 4 +#include #include "fast_math.hpp" #endif -#endif - namespace ideep { template<> @@ -78,10 +75,12 @@ inline tensor::data_type tensor::descriptor::type_to_id() { return tensor::data_type::s8; } -/// Descriptor group, create relative descriptors all in one +/// A group of primitive descriptors, pack related reorder descriptors +/// with computational descriptor. class descriptor_group: public c_wrapper_complex { friend class primitive_group; public: + /// Post ops for fusion operations class post_ops : public c_wrapper { public: post_ops() : c_wrapper([]() { @@ -91,7 +90,6 @@ class descriptor_group: public c_wrapper_complex { return result; }()) {} - int num_ops() const { return mkldnn_post_ops_len(get()); } @@ -195,6 +193,8 @@ class descriptor_group: public c_wrapper_complex { } }; + /// Attribute class for extra information into computations, including + /// post operations, rounding mode, etc. 
class attr_t : public c_wrapper { public: attr_t() : c_wrapper([]() { @@ -273,6 +273,7 @@ class descriptor_group: public c_wrapper_complex { return attr; } + // XXX: concept error static inline attr_t residual(float scale = 1.0, float alpha = 0.f, float beta = 0.f) { attr_t attr; @@ -304,9 +305,14 @@ class descriptor_group: public c_wrapper_complex { } public: + /// Empty construction descriptor_group() : c_wrapper_complex() {} + /// Query interface + /// + /// @param q query kind + /// @param index query index tensor::descriptor expected_descriptor_of(mkldnn::query q , int index = 0) const { mkldnn_primitive_desc_t cdesc; @@ -319,55 +325,83 @@ class descriptor_group: public c_wrapper_complex { return param::descriptor(cdesc); } + /// Query expected input descriptor + /// + /// @param index Input index tensor::descriptor expected_input_descriptor(int index) const { return expected_descriptor_of(mkldnn::input_pd, index); } + /// Query expected output descriptor + /// + /// @param index Input index tensor::descriptor expected_output_descriptor(int index) const { return expected_descriptor_of(mkldnn::output_pd, index); } + /// Query expected src descriptor + /// tensor::descriptor expected_src_descriptor() const { return expected_descriptor_of(mkldnn::src_pd); } + /// Query expected weights descriptor + /// tensor::descriptor expected_weights_descriptor() const { return expected_descriptor_of(mkldnn::weights_pd); } + /// Query expected bias descriptor + /// tensor::descriptor expected_bias_descriptor() const { return expected_descriptor_of(mkldnn::weights_pd, 1); } + /// Query expected dst descriptor + /// tensor::descriptor expected_dst_descriptor() const { return expected_descriptor_of(mkldnn::dst_pd, 0); } + /// Query expected workspace descriptor + /// tensor::descriptor expected_workspace_descriptor() const { return expected_descriptor_of(mkldnn::workspace_pd, 0); } + /// Query expected gradient X descriptor + /// tensor::descriptor expected_gradx_descriptor() const { return expected_descriptor_of(mkldnn::diff_src_pd, 0); } + /// Query expected gradient Y descriptor + /// tensor::descriptor expected_grady_descriptor() const { return expected_descriptor_of(mkldnn::diff_dst_pd, 0); } + /// Qeury expected weights gradient descriptor + /// tensor::descriptor expected_gradw_descriptor() const { return expected_descriptor_of(mkldnn::diff_weights_pd, 0); } + /// Qeury expected bias gradient descriptor + /// tensor::descriptor expected_gradb_descriptor() const { return expected_descriptor_of(mkldnn::diff_weights_pd, 1); } + /// Query number of inputs + /// int num_of_inputs() const { return mkldnn_primitive_desc_query_s32(get() , mkldnn::convert_to_c(mkldnn::num_of_inputs_s32), 0); } + /// Query number of outputs + /// int num_of_outputs() const { return mkldnn_primitive_desc_query_s32(get() , mkldnn::convert_to_c(mkldnn::num_of_outputs_s32), 0); @@ -390,8 +424,11 @@ class descriptor_group: public c_wrapper_complex { } }; +/// A group of primitives, pack related reorder with computation. 
+/// It serves as a base class of computation class primitive_group: public c_wrapper_complex { public: + /// Empty constructor primitive_group() : c_wrapper_complex() {} @@ -404,6 +441,7 @@ class primitive_group: public c_wrapper_complex { return cdesc; } + /// Query interface tensor::descriptor expected_descriptor_of(mkldnn::query q, int index = 0) const { mkldnn_primitive_desc_t cdesc; @@ -430,6 +468,8 @@ class primitive_group: public c_wrapper_complex { auxiliaries_[index].reset(result); } + /// Specific query interface, not valid for all computations. + /// tensor::descriptor expected_input_descriptor(int index) const { return expected_descriptor_of(mkldnn::input_pd, index); } @@ -688,6 +728,8 @@ struct spliter : public reorder { } }; +/// Computation class, abstruct of computation +/// struct computation : public primitive_group { computation() = default; @@ -837,9 +879,26 @@ struct computation : public primitive_group { std::vector primitive_inputs_; }; +/// Convolution forward computation, this class represent a MKL-DNN +/// convolution forward process, also manage old computation instances. struct convolution_forward: public computation, public utils::computation_cache { + /// Descriptor class for describing convolution forward process + /// struct descriptor : public descriptor_group { + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param bias_desc Bias tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &bias_desc, @@ -877,6 +936,19 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &dst_desc, @@ -912,6 +984,21 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param bias_desc Bias tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param dilates Dilates parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind 
Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &bias_desc, @@ -949,6 +1036,20 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param dilates Dilates parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &dst_desc, @@ -2211,6 +2312,112 @@ struct eltwise_backward : public computation, } }; +struct channel_shuffle_forward { +public: + channel_shuffle_forward() = delete; + +public: + static void compute_impl(const tensor& src, tensor& dst, int group) { + auto C = src.get_dim(1); + auto K = C / group; + auto S = src.get_dim(2) * src.get_dim(3); // h * w + float* X = static_cast(src.get_data_handle()); + float* Y = static_cast(dst.get_data_handle()); + + IDEEP_ENFORCE(C % group == 0, "Invalid channel and group"); + IDEEP_ENFORCE(src.get_data_type() == tensor::data_type::f32, "invalid data type"); + + if (group <= 1) { + direct_copy::compute(src, dst); + return; + } + + # pragma omp parallel for collapse(3) schedule(static) + for (auto n = 0; n < src.get_dim(0); n++) { + for (auto g = 0; g < group; g++) { + for (auto i = 0; i < K; i++) { + auto* X_offset = (X + g * K * S + n * C * S + S * i); + auto* Y_offset = (Y + g * S + n * C * S + group * S * i); +#ifdef __AVX2__ + FM_AVX2_PREF::memcpy(X_offset, Y_offset, S); +#else + std::memcpy(Y_offset, X_offset, sizeof(float) * S); +#endif + } + } + } + } + + template + static void compute(const tensor& src, tensor& dst, const int group = 1) { + IDEEP_ENFORCE(src != dst, "Unsupport in-place op"); + IDEEP_ENFORCE(src.ndims() == 4, "Only support 4 dims"); + + auto src_in = src; + if (!src_in.is_public_format()) { + src_in.init( + {src.get_dims(), src.get_data_type(), format::nchw}); + reorder::compute(src, src_in); + } + + dst.reinit_like(src_in); + compute_impl(src_in, dst, group); + } +}; + +struct channel_shuffle_backward { +public: + channel_shuffle_backward() = delete; + +public: + static void compute_impl(const tensor& grady, tensor& gradx, int group) { + auto C = grady.get_dim(1); + auto K = C / group; + auto S = grady.get_dim(2) * grady.get_dim(3); // h * w + float* dY = static_cast(grady.get_data_handle()); + float* dX = static_cast(gradx.get_data_handle()); + + IDEEP_ENFORCE(C % group == 0, "Invalid channel and group"); + IDEEP_ENFORCE(grady.get_data_type() == tensor::data_type::f32, "invalid data type"); + + if (group <= 1) { + direct_copy::compute(grady, gradx); + return; + } + + # pragma omp parallel for collapse(3) schedule(static) + for (auto n = 0; n < grady.get_dim(0); n++) { + for (auto g = 0; g < group; g++) { + for (auto i = 0; i < K; i++) { + auto* dY_offset = (dY + g * S + n * C * S + group * S * i); + auto* dX_offset = (dX + g * K * S + n * C * S + S * i); +#ifdef __AVX2__ + FM_AVX2_PREF::memcpy(dY_offset, dX_offset, S); +#else + std::memcpy(dX_offset, 
dY_offset, sizeof(float) * S); +#endif + } + } + } + } + + template + static void compute(const tensor& grady, tensor& gradx, const int group = 1) { + IDEEP_ENFORCE(grady != gradx, "Unsupport in-place op"); + IDEEP_ENFORCE(grady.ndims() == 4, "Only support 4 dims"); + + auto grady_in = grady; + if (!grady_in.is_public_format()) { + grady_in.init( + {grady.get_dims(), grady.get_data_type(), format::nchw}); + reorder::compute(grady, grady_in); + } + + gradx.reinit_like(grady_in); + compute_impl(grady_in, gradx, group); + } +}; + struct sum : public computation, public utils::computation_cache { struct descriptor : public descriptor_group { @@ -3413,7 +3620,6 @@ struct dropout_backward { } }; -#if __GNUC__ > 4 struct eltwise_binary { public: enum eltwise_binary_op { @@ -3439,12 +3645,14 @@ struct eltwise_binary { } switch (op) { case ELTWISE_ADD: - utils::fast_math::add( +#ifdef __AVX2__ + FM_AVX2_PREF::add( static_cast(outputC.get_data_handle()), static_cast(inputA.get_data_handle()), static_cast(inputB_data), static_cast(inputA.get_nelems())); return; +#endif case ELTWISE_MUL: case ELTWISE_DIV: default: @@ -3455,7 +3663,6 @@ struct eltwise_binary { } } }; -#endif struct sum_array { public: diff --git a/include/ideep/fast_math.hpp b/include/ideep/fast_math.hpp index eb17c16b..31aac561 100644 --- a/include/ideep/fast_math.hpp +++ b/include/ideep/fast_math.hpp @@ -2,9 +2,16 @@ #define _FAST_MATH_HPP_ #include #include +#include #include #include #include "abstract_types.hpp" + +#ifdef __AVX2__ + +#define FM_AVX2_PREF \ + ideep::utils::fast_math + namespace ideep { namespace utils { @@ -18,147 +25,110 @@ typedef enum { avx512_mic_4ops, } cpu_isa_t; -template struct TypeMap {}; -#define MAP_T(v, F, I) \ - template<> struct TypeMap { using tF = F; using tI = I;}; -MAP_T(avx2, __m256, __m256i) -#undef MAP_T -#define TF typename TypeMap::tF -#define TI typename TypeMap::tI template class fast_math { static constexpr int thread_hold = 1024; + static constexpr int align_bytes = 32; + public: + using TF = __m256; + using TI = __m256i; template static inline unsigned get_vec_sz() { - switch (isa) { - case avx2: - return 256/8/sizeof(T); - case avx512_common: - case avx512_core: - return 512/8/sizeof(T); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return 0; - } + return 256 / 8 / sizeof(T); } - // Move this to utils - template static inline TI size_to_mask(unsigned nres) { - constexpr int on = -1; - constexpr int off = 0; - switch (isa) { - case avx2: - assert(nres < 8 && nres > 0); - switch(nres) { - case 1: - return _mm256_set_epi32(off, off, off, off, off, off, off, on); - case 2: - return _mm256_set_epi32(off, off, off, off, off, off, on, on); - case 3: - return _mm256_set_epi32(off, off, off, off, off, on, on, on); - case 4: - return _mm256_set_epi32(off, off, off, off, on, on, on, on); - case 5: - return _mm256_set_epi32(off, off, off, on, on, on, on, on); - case 6: - return _mm256_set_epi32(off, off, on, on, on, on, on, on); - case 7: - return _mm256_set_epi32(off, on, on, on, on, on, on, on); - default: - return _mm256_set_epi32(off, off, off, off, off, off, off, off); - } - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - } + assert(nres < 8 && nres >= 0); + std::bitset<8> e = ~((1 << nres) - 1); + return _mm256_set_epi32(e[7]-1, e[6]-1, e[5]-1, e[4]-1, e[3]-1, e[2]-1, e[1]-1, e[0]-1); } -#define BIN_OPS(name) \ - template \ - static TF name##_ps (TF v1, TF v2) { \ - switch (isa) { \ - case avx2: \ - return _mm256_##name##_ps(v1, v2); \ - default: \ 
- throw error(mkldnn_unimplemented, "Not implemented!"); \ - return set1_ps(0.f); \ - } \ + static inline TF add_ps(TF v1, TF v2) { + return _mm256_add_ps(v1, v2); } - BIN_OPS(add); - BIN_OPS(mul); - BIN_OPS(div); -#undef BIN_OPS + static inline TF mul_ps(TF v1, TF v2) { + return _mm256_mul_ps(v1, v2); + } - template - static TF set1_ps (const T v) { - switch (isa) { - case avx2: - return _mm256_set1_ps(v); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + static inline TF div_ps(TF v1, TF v2) { + return _mm256_div_ps(v1, v2); } - template - static TF sqrt_ps (TF v) { - switch (isa) { - case avx2: - return _mm256_sqrt_ps(v); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + static inline TF sqrt_ps(TF v) { + return _mm256_sqrt_ps(v); } - template - static TF load_ps (const T *src) { - switch (isa) { - case avx2: - return _mm256_load_ps(src); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + template + static inline TF set1_ps(const T v) { + return _mm256_set1_ps(v); } - template - static TF maskload_ps (const T *src, TI mask) { - switch (isa) { - case avx2: - return _mm256_maskload_ps(src, mask); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + template + static inline TF load_ps(const T *src) { + return _mm256_load_ps(src); } - template - static void store_ps (T *dst, TF v) { - switch (isa) { - case avx2: - _mm256_store_ps(dst, v); - return; - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return; - } + template + static inline TF maskload_ps(const T *src, TI mask) { + return _mm256_maskload_ps(src, mask); } - template - static void maskstore_ps (T *dst, TI mask, TF v) { - switch (isa) { - case avx2: - _mm256_maskstore_ps(dst, mask, v); - return; - default: - throw error(mkldnn_unimplemented, "Not implemented!"); + template + static inline void store_ps(T *dst, TF v) { + _mm256_store_ps(dst, v); + } + + template + static inline void maskstore_ps(T *dst, TI mask, TF v) { + _mm256_maskstore_ps(dst, mask, v); + } + + template + static inline void memcpy(T* src, T* dst, size_t size) { + auto itemsize = sizeof(T); + auto vec_sz = get_vec_sz(); + auto num_vec = size / vec_sz; + auto num_res = size % vec_sz; + + if ((size < vec_sz) || + (IDEEP_MOD_PTR(src, align_bytes) != IDEEP_MOD_PTR(dst, align_bytes))) { + std::memcpy(dst, src, itemsize * size); return; } + + auto cpy_cnt = 0; + auto cur_res = num_res; + auto cur_vec = num_vec; + if (!IDEEP_IS_ALIGNED_PTR(src, align_bytes)) { + cpy_cnt = (align_bytes - IDEEP_MOD_PTR(src, align_bytes)) / itemsize; + std::memcpy(dst, src, itemsize * cpy_cnt); + src += cpy_cnt; + dst += cpy_cnt; + } + IDEEP_ENFORCE(cpy_cnt < vec_sz, "invalid copy count"); + IDEEP_ENFORCE(IDEEP_IS_ALIGNED_PTR(dst, align_bytes), "not bytes aligned address"); + + if (cpy_cnt > cur_res) { + cur_vec -= 1; + cur_res = vec_sz - (cpy_cnt - cur_res); + } else { + cur_res -= cpy_cnt; + } + + for (auto j = 0; j < cur_vec; j++, dst += vec_sz, src += vec_sz) { + auto vmm = load_ps(src); + store_ps(dst, vmm); + } + + if (cur_res != 0) { + auto mask = size_to_mask(cur_res); + auto vmm = maskload_ps(src, mask); + maskstore_ps(dst, mask, vmm); + } } // Unary ops @@ -176,7 +146,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src, mask); vmm1 = op_mask(vmm1, mask); maskstore_ps(dst, mask, vmm1); @@ -190,11 
+160,11 @@ class fast_math { single_thread_vecwise_unary_op(dst, src, nelems, op, op_mask); } - template + template static void inv_square_var(float epsilon, - const elem_t* inv_sqrt_var, elem_t* variance, unsigned nelems) { + const T* inv_sqrt_var, T* variance, unsigned nelems) { if (isa == avx2) { - if (std::is_same::value) { + if (std::is_same::value) { const float *src = reinterpret_cast(inv_sqrt_var); float *dst = reinterpret_cast(variance); @@ -222,11 +192,11 @@ class fast_math { } } - template + template static void inv_sqrt_var(float epsilon, const void* variance, void* inv_sqrt_var, unsigned nelems) { if (isa == avx2) { - if (std::is_same::value) { + if (std::is_same::value) { const float *src = reinterpret_cast(variance); float *dst = @@ -245,7 +215,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src, mask); vmm1 = add_ps(vmm1, epsilones); vmm1 = sqrt_ps(vmm1); @@ -277,7 +247,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src1, mask); TF vmm2 = maskload_ps(src2, mask); vmm2 = op_mask(vmm1, vmm2); @@ -292,12 +262,12 @@ class fast_math { single_thread_vecwise_binary_op(dst, src1, src2, nelems, op, op_mask); } - template - static void add(elem_t *dst, const elem_t *src1, const elem_t *src2, + template + static void add(T *dst, const T *src1, const T *src2, unsigned nelems) { - if (std::is_same::value) { + if (std::is_same::value) { auto op = [] (TF vmm1, TF vmm2) { - vmm1 = add_ps(vmm1, vmm2); + vmm1 = add_ps(vmm1, vmm2); return vmm1; }; vecwise_binary_op(dst, src1, src2, nelems, op, op); @@ -305,7 +275,10 @@ class fast_math { throw error(mkldnn_unimplemented, "Not implemented!"); } } + }; } } #endif + +#endif diff --git a/include/ideep/tensor.hpp b/include/ideep/tensor.hpp index 56739798..4f382d04 100644 --- a/include/ideep/tensor.hpp +++ b/include/ideep/tensor.hpp @@ -12,15 +12,18 @@ namespace ideep { struct computation; /// @addtogroup api_tensor Tensor -// -/// Param class describes parameters internal to operators +/// +/// Param class handles operands to computations' internal, it wrappers MKL-DNN +/// memory primitive and provides utilities to manipulate underlying object. +/// It's also the base class of tensor, handles major tensor services. class param: public c_wrapper { public: using dims = mkldnn::memory::dims; using dim_t = dims::value_type; using data_type = mkldnn::memory::data_type; - /// A param descriptor. + /// Param descriptor class wrappers MKL-DNN memory primitive descriptor + /// and provides utilities to manipulate underlying object struct descriptor : public c_wrapper { friend class param; inline static mkldnn_primitive_kind_t convert_to_c(kind akind) { @@ -42,7 +45,6 @@ class param: public c_wrapper { md.format = convert_to_c(aformat); } - // borrowed from memory_desc_wrapper static inline void set_default_strides(dims &strides, const dims &adims, const int *perm = NULL) { static const int id_perm[] @@ -76,12 +78,12 @@ class param: public c_wrapper { } public: - /// Initiate a param descriptor, specifying all details. + /// Initiate a param descriptor, specifying blocking details. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. /// @param extra block information for data. 
- /// @param perm permutation for layout sequence + /// @param perm permutation for layout sequence descriptor(const dims adims, data_type adata_type, const dims stride, const dims block_dims, const dims stride_inner = dims(12, 1)) : c_wrapper([&adims, adata_type, &block_dims, @@ -98,7 +100,7 @@ class param: public c_wrapper { return result; }()), public_format_(format::blocked) {} - /// Initiate a param descriptor, specifying format. + /// Initiate a param descriptor, using format for blocking initialization. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. @@ -107,6 +109,7 @@ class param: public c_wrapper { :c_wrapper([&adims, adata_type, aformat]() { mkldnn::memory::validate_dims(adims); + // XXX: out of range enum might result unspecified behavior mkldnn_memory_desc_t data; if (adims.size() == 3) { fill_param(data, adims, adata_type, aformat); @@ -130,7 +133,7 @@ class param: public c_wrapper { return result; }()), public_format_(public_format(aformat)) {} - /// Initiate a param descriptor, specifying no format. + /// Initiate a param descriptor, assume nature format. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. @@ -143,15 +146,15 @@ class param: public c_wrapper { public_format_ = format::format_undef; } - /// Initiate a tensor descriptor from primitive_desc_t struct + /// Initiate a descriptor from primitive_desc_t struct /// /// @param adesc Pointer to a primitive_desct_t C struct - /// @param aformat Specify a format for current descriptor + /// @param aformat Specify public format for current descriptor descriptor(mkldnn_primitive_desc_t adesc, format aformat) :c_wrapper(adesc), public_format_(aformat) { } - /// Initiate a tensor descriptor from primitive_desc_t struct + /// Initiate a descriptor from primitive_desc_t struct /// /// @param adesc Pointer to a primitive_desct_t C struct descriptor(mkldnn_primitive_desc_t adesc) : descriptor(adesc, @@ -160,19 +163,19 @@ class param: public c_wrapper { mkldnn_primitive_desc_query_memory_d(adesc)->format))) { } - /// Initiate a tensor descriptor from another one, share resource + /// Initiate a descriptor from another, share resource /// /// @param adesc is a reference to another descriptor descriptor(const descriptor &adesc): c_wrapper(adesc), public_format_ (adesc.public_format_) { } - /// Empty initiate a tensor decriptor + /// Empty decriptor constructor /// descriptor():descriptor(dims(0), data_type::f32, format::format_undef) { } - /// Copy a tensor descriptor from another, share resource + /// Share a descriptor from another, share resource descriptor &operator=(const descriptor& adesc) { c_wrapper::operator=(adesc); public_format_ = adesc.public_format_; @@ -181,19 +184,26 @@ class param: public c_wrapper { /// Returns the number of bytes required to allocate the memory /// described including the padding area. + /// inline size_t get_size() const { return mkldnn_memory_primitive_desc_get_size(get()); } + /// Returns number of dimensions + /// inline int ndims() const { return get_mkldnn_memory_desc_t()->ndims; } + /// Returns dimension vector + /// inline dims get_dims() const { auto *internal = get_mkldnn_memory_desc_t(); return dims(internal->dims, &internal->dims[internal->ndims]); } + /// Returns descriptor data type + /// inline data_type get_data_type() const { auto *internal = get_mkldnn_memory_desc_t(); return static_cast(internal->data_type); @@ -225,6 +235,7 @@ class param: public c_wrapper { /// pre-condition. 4-dimension only /// 1. 
(format_undef, nchw) for all unknown format creation /// 2. (format_undef, ) compatible with all public correspondent + /// @param expected Expected format to transform to descriptor format_to(format expected) const { mkldnn_memory_desc_t adesc; const mkldnn_memory_desc_t *origin = get_mkldnn_memory_desc_t(); @@ -256,6 +267,9 @@ class param: public c_wrapper { return descriptor(result, expected); } + /// Change format from data representation to weights, only nature formats + /// were supported. + /// Example: from nchw to oihw descriptor as_weights_format() const { switch(get_internal_format()) { case format::nc: @@ -299,15 +313,18 @@ class param: public c_wrapper { return mkldnn_primitive_desc_query_memory_d(get()); } + /// Operator == inline bool operator ==(const descriptor &other) const { - // TODO: (format_undef, *) == (nhwc, *) like return mkldnn_memory_primitive_desc_equal(get(), other.get()); } + /// Operator != inline bool operator !=(const descriptor &other) const { return !operator==(other); } + /// Return format generated by MKL-DNN + // XXX: format might be out of range. format get_internal_format() const { return static_cast(this->get_mkldnn_memory_desc_t()->format); } @@ -438,7 +455,13 @@ class param: public c_wrapper { } }; + /// View is for describing a subregion from a param + /// struct view : public c_wrapper { + /// Create view by specifying starting coordinate and size of each dimension + /// @param host From which the view was created + /// @param volume Size of each dimension of the subregion + /// @param start Start coordinates view (const descriptor& host, dims volume, dims start) { mkldnn_primitive_desc_t result; error::wrap_c_api(mkldnn_view_primitive_desc_create(&result, @@ -512,6 +535,10 @@ class param: public c_wrapper { } }; + /// The template initialize param with a descriptor, allocate and manage + /// buffer automatically. A customized allocator can be specified to override + /// default implementation. + /// @param adesc Descriptor for the param template void init(const descriptor &adesc) { mkldnn_primitive_t result; @@ -527,6 +554,9 @@ class param: public c_wrapper { public_format_ = adesc.public_format_; } + /// The template initialize param with a descriptor. Specifiy extra buffer. + /// @param adesc Descriptor for the param + /// @param ahandle Buffer of the param void init(const descriptor &adesc, void *ahandle) { mkldnn_primitive_t result; error::wrap_c_api( @@ -534,24 +564,26 @@ class param: public c_wrapper { "could not create a memory primitive"); reset(result); - set_data_handle(ahandle); buffer_.reset(); + set_data_handle(ahandle); public_format_ = adesc.public_format_; } + /// The template initialize param with a descriptor, allocate and manage + /// buffer automatically. A customized allocator can be specified to override + /// default implementation. 
+ /// @param adesc Descriptor for the param void init(const descriptor &adesc) { init(adesc); } /// Function that refill tensor with new description or buffer - // template void reinit(const descriptor &adesc) { auto curr_size = get_size(); auto new_size = adesc.get_size(); - if (curr_size >= new_size || - (buffer_ == nullptr && get_data_handle() != nullptr)) { + if (curr_size >= new_size && buffer_.get() == get_data_handle()) { // We don't have to allocate new buffer or we don't manage the buffer // either way, we don't allocate new buffer // People who manage buffer provide enough space @@ -584,11 +616,15 @@ class param: public c_wrapper { /// Constructs a param and allocating internal buffer. /// - /// @param adesc param descriptor. + /// @param adesc Descriptor for the param param(const descriptor &adesc) { init(adesc); } + /// Constructs a param and allocating internal buffer. + /// + /// @param adesc Descriptor for the param. + /// @param ahandle Buffer for the param. param(const descriptor &adesc, void *ahandle) { init(adesc, ahandle); } @@ -596,11 +632,16 @@ class param: public c_wrapper { /// Recreate a param with completely different content from old one /// but reuse the param shell. Notice that after resize, its format /// is undefined + /// @param adims New dimension + /// @param adata_type New data type void resize(dims adims, data_type adata_type) { descriptor adesc(adims, adata_type); init(adesc); } + /// Reshape a param, reorder might happen if its format is internal + /// @param new_dims New dimension + /// @result Return new param reference param &reshape(dims new_dims) { if (!get_descriptor().is_shape_compatible(new_dims)) { throw error(mkldnn_runtime_error, "reshape to incompatible shape"); @@ -609,8 +650,8 @@ class param: public c_wrapper { param p; p.init({get_dims(), get_data_type()}); reorder_to(p); - set_data_handle(p.get_data_handle()); buffer_ = p.get_tensor_buffer(); + set_data_handle(p.get_data_handle()); } set_descriptor({new_dims, get_data_type()}); @@ -619,11 +660,12 @@ class param: public c_wrapper { return *this; } + // XXX: ??? param &_reshape(dims new_dims) { return reshape(new_dims); } - /// Returns the internal structure of primitive descriptor. + /// Returns pointer to structure of primitive descriptor. 
const_mkldnn_primitive_desc_t get_mkldnn_primitive_desc_t() const { const_mkldnn_primitive_desc_t cdesc; error::wrap_c_api(mkldnn_primitive_get_primitive_desc(get(), @@ -632,6 +674,7 @@ class param: public c_wrapper { return cdesc; } + /// Return pointer to memory descriptor structure const mkldnn_memory_desc_t *get_mkldnn_memory_desc_t() const { const_mkldnn_primitive_desc_t cdesc; error::wrap_c_api(mkldnn_primitive_get_primitive_desc(get(), @@ -650,7 +693,10 @@ class param: public c_wrapper { return descriptor(clone, public_format_); } - // Force a descriptor into param + /// Set a descriptor into param to replace the older one, keep buffer + /// It is caller's responsibility to make sure the original buffer is large + /// enough for specified descriptor + /// @param new_desc New descriptor void set_descriptor(const descriptor& new_desc) { // Keep the original management auto buf = std::move(buffer_); @@ -659,15 +705,21 @@ class param: public c_wrapper { public_format_ = new_desc.public_format_; } + /// Create a view from current param + /// @param view_dims Size of each dimension of the view + /// @param offsets Start cooridinate of the view view create_view(dims view_dims, dims offsets) const { return view(get_descriptor(), view_dims, offsets); } + /// Reture param's data type inline data_type get_data_type() const { const mkldnn_memory_desc_t *adesc = get_mkldnn_memory_desc_t(); return static_cast(adesc->data_type); } + /// Return size of specified dimension + /// @param index Dimension index inline dim_t get_dim(int index) const { if (index < 0 || index >= ndims()) return static_cast(0); @@ -675,23 +727,28 @@ class param: public c_wrapper { return mdesc->dims[index]; } + /// Return dimensions' size vector inline dims get_dims() const { const mkldnn_memory_desc_t *mdesc = get_mkldnn_memory_desc_t(); return dims (mdesc->dims, &mdesc->dims[mdesc->ndims]); } + /// Return number of dimensions inline int ndims() const { return get_mkldnn_memory_desc_t()->ndims; } + /// Return whether the tensor is empty inline bool is_empty() const { return ndims() == 0 && get_data_handle() == 0; } + /// Return buffer size required by the param inline size_t get_size() const { return mkldnn_memory_primitive_desc_get_size(get_mkldnn_primitive_desc_t()); } + /// Return element number of the param inline dim_t get_nelems() const { const mkldnn_memory_desc_t *mdesc = get_mkldnn_memory_desc_t(); return std::accumulate( @@ -701,15 +758,18 @@ class param: public c_wrapper { /// Returns a handle of the data contained in the param. On /// the CPU engine, this is a pointer to the allocated memory. inline void *get_data_handle() const { - void *handle; - error::wrap_c_api(mkldnn_memory_get_data_handle(get(), &handle), - "could not get native handle"); - return handle; + void *handle; + error::wrap_c_api(mkldnn_memory_get_data_handle(get(), &handle), + "could not get native handle"); + return handle; } + /// Set new buffer handle into param + /// @param handle Buffer handle inline void set_data_handle(void *handle) { - error::wrap_c_api(mkldnn_memory_set_data_handle(get(), handle), - "could not set native handle"); + if (buffer_.get() != handle && buffer_ != nullptr) buffer_.reset(); + error::wrap_c_api(mkldnn_memory_set_data_handle(get(), handle), + "could not set native handle"); } /// Materialize a param. 
For specific scenario param will allocate @@ -746,10 +806,12 @@ class param: public c_wrapper { return static_cast(aformat); } + /// Return internal format of the param inline format get_internal_format() const { return static_cast(get_mkldnn_memory_desc_t()->format); } + /// Need reorder if current param used by non MKL-DNN routines. inline bool need_reorder() const { return get_internal_format() != public_format_; } @@ -860,11 +922,16 @@ class param: public c_wrapper { std::shared_ptr buffer_; }; -/// Tensor that describes the data and its explanation. +/// Tensor that describes data buffer and its explanation. +/// It also integrates an optional tensor as an intemediate results, used in +/// Pooling/LRN class tensor : public param { public: using param::param; + /// Pack an extra tensor into current one, allocate buffer using specified + /// allocator. + /// @param descriptor Descriptor of the extra tensor template void init_extra(const descriptor &workspace) { auto twin = new tensor(); @@ -872,66 +939,97 @@ class tensor : public param { twin_.reset(twin); } + /// Pack an extra tensor into current one + /// + /// @param descriptor Descriptor of the extra tensor + /// @param handle Buffer handle void init_extra(const descriptor &workspace, void *handle) { twin_.reset(new tensor(workspace, handle)); } + /// Pack an extra tensor into current one + /// + /// @param tensor Extra tensor to pack in void init_extra(const tensor &ws) { twin_.reset(); twin_ = std::make_shared(ws); } // for gcc4.8 + /// Empty construction tensor() : param() {} + /// Construct tensor + /// + /// @param major Descriptor for the tensor + /// @param workspace Extra descriptor of the tensor which will be packed in tensor(const descriptor &major, const descriptor &workspace) : tensor(major) { init_extra(workspace); } + /// Construct tensor + /// + /// @param major Descriptor of the tensor + /// @param h_major Buffer handle of the tensor + /// @param workspace Descriptor of the extra tensor tensor(const descriptor &major, void *h_major, const descriptor &workspace) : tensor(major, h_major) { init_extra(workspace); } + /// Construct tensor + /// + /// @param major Descriptor of the tensor + /// @param h_major Buffer handle of the tensor + /// @param workspace Descriptor of the extra tensor + /// @param h_workspace Buffer handle of the extra tensor tensor(const descriptor &major, void *h_major, const descriptor &workspace, void *h_workspace) : tensor(major, h_major) { init_extra(workspace, h_workspace); } + /// Copy constructor tensor (const tensor& t) : param(t) { twin_ = t.twin_; } + /// Move constructor tensor (tensor&& movable) : param(std::move(movable)) { twin_ = std::move(movable.twin_); } + /// Assignment operator tensor &operator = (const tensor& t) { param::operator = (t); twin_ = t.twin_; return *this; } + /// Move assignment operator tensor &operator = (tensor&& movable) { param::operator = (std::move(movable)); twin_ = std::move(movable.twin_); return *this; } + /// Return extra packed tensor tensor *get_extra() { return twin_.get(); } + /// Return extra packed tensor const tensor *get_extra() const { return twin_.get(); } + /// Decide wether there is an extra tensor packed in bool has_extra() const { return twin_ != nullptr; } + // XXX: ??? 
tensor as_weights() const { tensor ret = *this; if (!is_weights()) diff --git a/python/ideep4py/py/mm/mdarray.cc b/python/ideep4py/py/mm/mdarray.cc index b092a81c..4a397988 100755 --- a/python/ideep4py/py/mm/mdarray.cc +++ b/python/ideep4py/py/mm/mdarray.cc @@ -282,6 +282,50 @@ PyObject *mdarray::inplace_axpby(float a, PyObject *self, float b, PyObject *o) return self; } +void mdarray::set(PyObject *o) { + // Resource manager, for GCC do not accept lambda + struct py_decref { + void operator () (PyObject *p) const { + Py_DECREF(p); + } + }; + + std::unique_ptr op(nullptr); + + // Create mdarray from buffer provider + if (reinterpret_cast(o->ob_type) == &PyArray_Type) { + o = py_mdarray_from(o); + op.reset(o); + } + + void *oprd2; + int res = SWIG_ConvertPtr(o, &oprd2, nullptr, 0); + + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_ValueError, "Wrong operand object in add wrapper"); + return; + } + + auto in = *(reinterpret_cast(oprd2))->get(); + auto dims = get_dims(); + auto in_dims = in.get_dims(); + if (dims.size() != in_dims.size()) + throw error(mkldnn_invalid_arguments, "mdarray set: Inconsistent ndims"); + for (size_t d = 0; d < dims.size(); d++) { + if (dims[d] != in_dims[d]) + throw error(mkldnn_invalid_arguments, "mdarray set: Inconsistent dims"); + } + + tensor in_ = in; + if (in.get_descriptor() != get_descriptor()) { + in_.init(get_descriptor()); + reorder::compute(in, in_); + } + + memcpy(get_data_handle(), in_.get_data_handle(), get_size()); + return; +} + PyObject *mdarray::m_Add(PyObject *self, PyObject *o) { // Array Broadcast if (!is_mdarray_supported(self, o)) { diff --git a/python/ideep4py/py/mm/mdarray.h b/python/ideep4py/py/mm/mdarray.h index 18d5bef9..25eae780 100755 --- a/python/ideep4py/py/mm/mdarray.h +++ b/python/ideep4py/py/mm/mdarray.h @@ -325,6 +325,8 @@ class mdarray : public ideep::tensor { PyObject *flat(void); + void set(PyObject *o); + PyObject *reshape(py_handle *self, std::vector dims); PyObject *m_mult_div(PyObject *self, PyObject *o, int mult_or_div, bool inplace); diff --git a/python/ideep4py/py/mm/mdarray.i b/python/ideep4py/py/mm/mdarray.i index cf236547..d952aaa0 100644 --- a/python/ideep4py/py/mm/mdarray.i +++ b/python/ideep4py/py/mm/mdarray.i @@ -100,6 +100,11 @@ PyObject *flat() { return (*self)->flat(); } + + void set(PyObject *o){ + (*self)->set(o); + return; + } } /* mdarray::reshape */ diff --git a/python/ideep4py/tests/mm/test_mdarray_set_mm.py b/python/ideep4py/tests/mm/test_mdarray_set_mm.py new file mode 100755 index 00000000..797f6597 --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray_set_mm.py @@ -0,0 +1,54 @@ +import ideep4py # NOQA +import numpy +import testing +from ideep4py import mdarray +import unittest + + +class TestMdarraySet(unittest.TestCase): + def setUp(self): + self.check_options = {'atol': 1e-5, 'rtol': 1e-4} + + def test_set1(self): + x = numpy.array([1, 1, 1], dtype=numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x = numpy.array([1, 2, 1], dtype=numpy.float32) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set2(self): + x = numpy.arange(24, dtype=numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x.fill(1) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set3(self): + x = numpy.random.rand(10, 10, 10, 10) + x = x.astype(numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x = 
numpy.random.rand(10, 10, 10, 10) + x = x.astype(numpy.float32) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set4(self): + x = numpy.array([0, 0, 0, 0], dtype=numpy.float32) + mx1 = mdarray(x) + mx2 = mdarray(x) + mx1.fill(0) + mx2.set(mx1) + numpy.testing.assert_allclose( + mx1, mx2, **self.check_options) + + +testing.run_module(__name__, __file__) diff --git a/python/setup.py b/python/setup.py index 42e4c8bc..be4ce944 100644 --- a/python/setup.py +++ b/python/setup.py @@ -62,6 +62,12 @@ def clean_ext(): ############################################################################### # Custom build commands ############################################################################### +class build(distutils.command.build.build): + def run(self): + prepare_ext() + distutils.command.build.build.run(self) + + class build_ext(setuptools.command.build_ext.build_ext): def run(self): prepare_ext() @@ -84,6 +90,7 @@ def run(self): cmdclass = { + 'build': build, 'build_ext': build_ext, 'install': install, 'clean': clean, @@ -160,7 +167,7 @@ def run(self): setup( name='ideep4py', - version='2.0.0_b1', + version='2.0.0', description='ideep4py is a wrapper for iDeep library.', author='Intel', author_email='',
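Usage sketch for the channel_shuffle_forward / channel_shuffle_backward computations added in include/ideep/computations.hpp. This snippet is not part of the patch: it assumes the allocator template parameter of compute() defaults like those of the other computations in that header, and the shapes and group count are illustrative only.

    #include <algorithm>
    #include "ideep.hpp"

    int main() {
      using tensor = ideep::tensor;

      // NCHW input with 8 channels, shuffled in 2 groups of 4.
      tensor::descriptor src_desc({1, 8, 4, 4}, tensor::data_type::f32,
                                  ideep::format::nchw);
      tensor src(src_desc);
      std::fill_n(static_cast<float *>(src.get_data_handle()),
                  src.get_nelems(), 1.f);

      // Forward shuffle; dst is allocated inside compute() via reinit_like().
      tensor dst;
      ideep::channel_shuffle_forward::compute(src, dst, /* group = */ 2);

      // Backward pass permutes the gradient back to the original order.
      tensor gradx;
      ideep::channel_shuffle_backward::compute(dst, gradx, /* group = */ 2);
      return 0;
    }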
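The vectorized copy added in include/ideep/fast_math.hpp takes its arguments as (src, dst, nelems), which is the reverse of std::memcpy and a count in elements rather than bytes; it falls back to std::memcpy whenever the two pointers differ in their 32-byte alignment remainder. A small self-check, assuming an AVX2 build so that FM_AVX2_PREF is defined:

    #include <cassert>
    #include <cstddef>
    #include <vector>
    #include "ideep.hpp"

    int main() {
    #ifdef __AVX2__
      std::vector<float> a(1000), b(1000, 0.f);
      for (std::size_t i = 0; i < a.size(); ++i)
        a[i] = static_cast<float>(i);

      // Argument order is (src, dst, nelems) -- the reverse of std::memcpy.
      FM_AVX2_PREF::memcpy(a.data(), b.data(), a.size());

      for (std::size_t i = 0; i < a.size(); ++i)
        assert(a[i] == b[i]);
    #endif
      return 0;
    }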
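The rewritten size_to_mask() replaces the eight-case switch with a bitset expression. The scalar sketch below, also not part of the patch, verifies that e[i] - 1 yields -1 (lane enabled) for the first nres lanes and 0 (lane disabled) for the rest, which is exactly the mask layout the old switch produced:

    #include <bitset>
    #include <cassert>

    int main() {
      for (unsigned nres = 0; nres < 8; ++nres) {
        // Bits [0, nres) are cleared, the remaining bits are set.
        std::bitset<8> e = ~((1u << nres) - 1u);
        for (unsigned i = 0; i < 8; ++i) {
          int lane = (e[i] ? 1 : 0) - 1;   // mirrors e[i] - 1 in the patch
          assert(lane == (i < nres ? -1 : 0));
        }
      }
      return 0;
    }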