diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..504e12c2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,35 @@
+*~
+*.egg/
+*.pyc
+*.pyo
+*.cpp
+*.so
+cscope.*
+tags
+build
+\#*\#
+.\#*
+.coverage
+.eggs/
+_readthedocs_build
+ideep.egg-info/
+dist/
+htmlcov/
+.idea/
+ideep/python/api/c_api.py
+ideep/python/api/support.py
+ideep/python/api/memory.py
+ideep/python/api/inner_product_*.py
+ideep/python/api/reorder.py
+ideep/python/api/convolution_*.py
+ideep/python/api/eltwise_*.py
+ideep/python/api/concat.py
+ideep/python/api/lrn_*.py
+ideep/python/api/pooling_*.py
+ideep/python/api/bn_*.py
+ideep/python/api/view.py
+ideep/python/api/sum.py
+ideep/python/api/cosim_dump.py
+ideep/python/api/dropout.py
+ideep/python/mdarray.py
+external/mkldnn/source
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..d66e8c35
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "mkl-dnn"]
+	path = mkl-dnn
+	url = https://github.com/01org/mkl-dnn.git
diff --git a/.pep8 b/.pep8
new file mode 100644
index 00000000..3051e81f
--- /dev/null
+++ b/.pep8
@@ -0,0 +1,3 @@
+[pep8]
+exclude=caffe_pb*,.eggs,*.egg,build
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..2956417a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2018 Intel Corporation.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..4646dae2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,51 @@
+# iDeep: Intel Deep Learning Package
+
+Intel Deep Learning Package (iDeep) is an open source performance library of primitives for accelerating deep learning frameworks on Intel Architecture. iDeep provides a user-friendly API and highly tuned implementations for standard DNN routines.
+
+The package provides C and Python APIs.
+
+## iDeep Python Package (ideep4py) Requirements
+
+We recommend the following Linux distributions:
+- Ubuntu 14.04/16.04 LTS 64bit
+- CentOS 7 64bit
+
+The following versions of Python can be used:
+- 2.7.5+, 3.5.2+, and 3.6.0+
+
+The recommended environments above are tested. We cannot guarantee that ideep4py works on other environments, including Windows and macOS, even if it appears to run correctly.
+
+Minimum requirements:
+- Numpy 1.9+
+- Six 1.9+
+- Swig 3.0.12
+- Glog 0.3.5
+- Cmake 2.8.0
+- Doxygen 1.8.5
+- C++ compiler with C++11 standard support
+
+## Installation of ideep4py
+
+If you use an old ``setuptools``, upgrade it:
+
+```
+pip install -U setuptools
+```
+
+Then install ideep4py from the source code:
+```
+python setup.py install
+```
+
+Use pip to uninstall ideep4py:
+
+```sh
+$ pip uninstall ideep4py
+```
+
+## More information
+- ideep github: https://github.com/intel/ideep.git
+
+## License
+MIT License (see `LICENSE` file).
diff --git a/dlcp/Makefile b/dlcp/Makefile
new file mode 100644
index 00000000..e781bdfd
--- /dev/null
+++ b/dlcp/Makefile
@@ -0,0 +1,88 @@
+##################################################################################
+# Copyright (c) 2018 Intel Corporation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+##################################################################################
+
+
+LOCAL_DIR = $(shell pwd)
+
+DLCP_CXX ?= icpc
+
+FLAG_DEBUG ?= 0
+
+AR = ar
+CXXFLAGS += -fPIC
+
+ifneq (,$(findstring icpc, $(DLCP_CXX)))
+    CXX = $(DLCP_CXX)
+    CXXFLAGS += -std=c++11 -qopenmp
+    LDFLAGS += -static-intel
+else
+    $(error Unsupported compiler $(DLCP_CXX))
+endif
+
+ifeq ($(FLAG_DEBUG), 1)
+    CXXFLAGS += -O0 -g
+else
+    CXXFLAGS += -O2
+endif
+
+COMPRESSION_LIB = lib/libdlcomp.so
+COMPRESSION_LIBNAME = libdlcomp.so
+SRC_DIR = $(LOCAL_DIR)/src
+INCL_DIR = $(LOCAL_DIR)/include $(LOCAL_DIR)/src
+
+TARGET = libdlcomp.so
+INCS = $(addprefix -I,$(INCL_DIR))
+LDFLAGS += -ldl -lrt -lpthread -liomp5
+CXXFLAGS += $(addprefix -I,$(INCL_DIR))
+
+
+SRCS += src/dl_compression_impl.cpp
+SRCS += src/dl_compression_util.cpp
+SRCS += src/dl_compression.cpp
+
+OBJS := $(SRCS:.cpp=.o)
+
+
+all: $(TARGET)
+
+$(TARGET): $(COMPRESSION_LIB)
+
+$(COMPRESSION_LIB): $(OBJS)
+	$(CXX) $(CXXFLAGS) -shared -Wl,-soname,$(COMPRESSION_LIBNAME) -o $(COMPRESSION_LIB) $(OBJS) $(LDFLAGS)
+
+$(SRC_DIR)/%.o: $(SRC_DIR)/%.cpp
+	$(CXX) -c $(CXXFLAGS) $< -o $@
+
+clean:
+	rm -f $(SRC_DIR)/*.o $(COMPRESSION_LIB)
+
+cleanall: clean
+
diff --git a/dlcp/env_setup.sh b/dlcp/env_setup.sh
new file mode 100755
index 00000000..6ecaef56
--- /dev/null
+++ b/dlcp/env_setup.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+source /opt/intel/bin/compilervars.sh intel64
diff --git a/dlcp/include/dl_compression.h b/dlcp/include/dl_compression.h
new file mode 100644
index 00000000..e081461b
--- /dev/null
+++ b/dlcp/include/dl_compression.h
@@ -0,0 +1,193 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef DL_COMPRESSION_H
+#define DL_COMPRESSION_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+    DL_COMP_NONE = 0,
+    DL_COMP_DFP  = 1,
+} dl_comp_method_t;
+
+typedef enum {
+    DL_COMP_OK                               = 0,
+    DL_COMP_FAIL                             = 1,
+    DL_COMP_FAIL_SRC_DATA_TYPE_NOT_SUPPORTED = 2,
+    DL_COMP_FAIL_RATIO_NOT_SUPPORTED         = 3,
+    DL_COMP_FAIL_COMP_METHOD_NOT_SUPPORTED   = 4,
+    DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT   = 5,
+    DL_COMP_FAIL_NOT_SUPPORTED               = 6
+} dl_comp_return_t;
+
+typedef enum {
+    DL_COMP_INT8    = 0,
+    DL_COMP_FLOAT16 = 1,
+    DL_COMP_FLOAT32 = 2,
+    DL_COMP_FLOAT64 = 3,
+} dl_comp_data_type_t;
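+
+// Note (editorial): as of this patch, dl_comp_compress_buffer accepts
+// only DL_COMP_FLOAT32 sources with comp_ratio == 4 and DL_COMP_DFP
+// (see dlcp/src/dl_compression.cpp); the other enumerators are
+// placeholders for future methods.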
+
+// Compress src buffer into dst buffer.
+//
+// Parameters:
+//   src           [in]     pointer to the src buffer
+//   dst           [out]    pointer to the dst buffer
+//   dataCount     [in]     number of elements to be compressed
+//   diff          [in/out] carries the precision lost by the previous
+//                          compression in, and returns the precision
+//                          lost by this compression. If you don't care
+//                          about the lost precision, pass a NULL pointer.
+//   src_data_type [in]     data type of the src buffer
+//   comp_ratio    [in]     compression ratio; must be one of 2, 4, 8,
+//                          16, 32. E.g. compressing FLOAT32 to INT8
+//                          gives a comp_ratio of 4.
+//   method        [in]     compression algorithm
+// Returns:
+//   DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compress_buffer( const void *src,
+                                          void *dst,
+                                          size_t dataCount,
+                                          void *diff,
+                                          dl_comp_data_type_t src_data_type,
+                                          size_t comp_ratio,
+                                          dl_comp_method_t method );
+
+// Decompress src buffer into dst buffer.
+//
+// Parameters:
+//   src       [in]  pointer to the src buffer
+//   dst       [out] pointer to the dst buffer
+//   dataCount [in]  number of elements to be decompressed
+// Returns:
+//   DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_decompress_buffer( const void *src,
+                                            void *dst,
+                                            size_t dataCount );
+
+// Sum up compressed data from two input buffers and put the result
+// in outBuffer.
+//
+// Parameters:
+//   inBuffer1 [in]  pointer to a quantized data vector
+//   inBuffer2 [in]  pointer to a quantized data vector
+//   dataCount [in]  number of elements in inBuffer1 and inBuffer2
+//                   to be summed up
+//   outBuffer [out] pointer to a quantized data vector; the result
+//                   is placed in this buffer.
+// Returns:
+//   DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compressed_buffer_sum( const void *inBuffer1,
+                                                const void *inBuffer2,
+                                                size_t dataCount,
+                                                void *outBuffer );
+
+// Get the compressed block size: the minimum slicing granularity.
+// Operations such as multi-node all-reduce may divide a payload into
+// parts to improve communication efficiency; this API reports that
+// granularity. Its size depends on the src data type, comp_ratio and
+// the compression algorithm.
+//
+// Parameters:
+//   src_data_type [in] data type of the src data before compression
+//   comp_ratio    [in] compression ratio
+//   method        [in] compression algorithm
+// Returns:
+//   the block size in bytes.
+size_t dl_comp_get_sizeof_block( dl_comp_data_type_t src_data_type,
+                                 size_t comp_ratio,
+                                 dl_comp_method_t method );
+
+// Sum up the compressed data of two buffers and put the result in the
+// second buffer. Note that blockCount is the unit here: one block can
+// contain multiple data elements.
+//
+// Parameters:
+//   inBuffer    [in]     pointer to quantized data
+//   inoutBuffer [in/out] pointer to quantized data; the result is
+//                        placed in this buffer.
+//   blockCount  [in]     number of blocks to be summed up
+// Returns:
+//   DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compressed_buffer_reduce_sum( const void *inBuffer,
+                                                       void *inoutBuffer,
+                                                       size_t blockCount );
+
+// Util function for converting a data count into a block count.
+//
+// Parameters:
+//   dataCount [in] number of data elements
+// Returns:
+//   the corresponding number of blocks.
+size_t dl_comp_convert_block_count(size_t dataCount);
+
+// Util function to get how many elements fit in one block.
+// Parameters:
+//   N/A
+// Returns:
+//   the number of elements in one block.
+size_t dl_comp_get_elem_num_in_block();
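+
+// Illustrative round trip (editorial sketch, not part of the API; the
+// 1024-element count and malloc'd dst are assumptions of the example):
+//
+//   float src[1024];                     /* filled by the caller */
+//   size_t blocks = dl_comp_convert_block_count(1024);
+//   size_t bytes  = blocks * dl_comp_get_sizeof_block(DL_COMP_FLOAT32,
+//                                                     4, DL_COMP_DFP);
+//   int8_t *dst = (int8_t *)malloc(bytes);
+//   if (dl_comp_compress_buffer(src, dst, 1024, NULL, DL_COMP_FLOAT32,
+//                               4, DL_COMP_DFP) == DL_COMP_OK) {
+//       float out[1024];
+//       dl_comp_decompress_buffer(dst, out, 1024);
+//   }
+//   free(dst);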
+
+// Check the running environment.
+// Parameters:
+//   N/A
+// Returns:
+//   true if the check succeeds, otherwise false.
+//   If false, please disable the quantization functionality.
+//   E.g. we suggest not using quantization on machines that do not
+//   support avx512 instructions, because there is no performance gain.
+bool dl_comp_check_running_environ();
+
+// Util function for compressing float32 data to int8.
+// Parameters:
+//   srcBuffer [in]     src float32 data
+//   dstBuffer [out]    dst int8 data
+//   diff      [in/out] precision lost in compression
+//   dataCount [in]     data count
+// Return:
+//   0 on success, otherwise an error code.
+int dl_comp_compress_buffer_FLOAT32ToINT8( const void *srcBuffer,
+                                           void *dstBuffer,
+                                           void *diff,
+                                           size_t dataCount);
+
+// Util function for decompressing int8 to float32.
+// Parameters:
+//   srcBuffer [in]  contains the int8 compressed data
+//   dstBuffer [out] decompressed float32 data
+//   dataCount [in]  data count
+// Return:
+//   0 on success, otherwise an error code.
+int dl_comp_decompress_buffer_INT8ToFLOAT32(const void *srcBuffer,
+                                            void *dstBuffer,
+                                            size_t dataCount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dlcp/lib/.gitignore b/dlcp/lib/.gitignore
new file mode 100644
index 00000000..da883b4b
--- /dev/null
+++ b/dlcp/lib/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/dlcp/src/dl_compression.cpp b/dlcp/src/dl_compression.cpp
new file mode 100644
index 00000000..671a4175
--- /dev/null
+++ b/dlcp/src/dl_compression.cpp
@@ -0,0 +1,168 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include <stddef.h>
+
+#include "dl_compression.h"
+#include "dl_compression_impl.hpp"
+
+
+dl_comp_return_t dl_comp_compress_buffer( const void *src,
+                                          void *dst,
+                                          size_t dataCount,
+                                          void *diff,
+                                          dl_comp_data_type_t src_data_type,
+                                          size_t comp_ratio,
+                                          dl_comp_method_t method )
+{
+    // Parameter checking
+    if (src_data_type != DL_COMP_FLOAT32) {
+        return DL_COMP_FAIL_SRC_DATA_TYPE_NOT_SUPPORTED;
+    }
+
+    if (comp_ratio != 4) {
+        return DL_COMP_FAIL_RATIO_NOT_SUPPORTED;
+    }
+
+    if (method != DL_COMP_DFP) {
+        return DL_COMP_FAIL_COMP_METHOD_NOT_SUPPORTED;
+    }
+
+    // Do compression
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(DL_COMP_DFP);
+
+    return compInst->compress_buffer((float *)src,
+                                     (int8_t *)dst,
+                                     (float *)diff,
+                                     dataCount,
+                                     src == dst);
+}
+
+dl_comp_return_t dl_comp_decompress_buffer( const void *src,
+                                            void *dst,
+                                            size_t dataCount )
+{
+    dl_comp_head *compHead = (dl_comp_head *)src;
+
+    if (compHead->magic != DL_COMP_HEAD_MAGIC) {
+        // This is a work-around for MLSL: in MPI_Test an already
+        // decompressed buffer is sometimes handed back to the
+        // compression lib for decompression, so we simply ignore it
+        // in this case.
+        return DL_COMP_OK;
+    }
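+
+    // For example (editorial note), with DL_COMP_BLOCK_NUM = 256
+    // (defined in dl_compression_impl.hpp), dataCount = 1000 rounds up
+    // to 4 blocks below, while dataCount = 1024 gives exactly 4.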
+    size_t blockCount = dataCount % DL_COMP_BLOCK_NUM == 0 ?
+                        (dataCount / DL_COMP_BLOCK_NUM) :
+                        (dataCount / DL_COMP_BLOCK_NUM + 1);
+    // do de-compression
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(DL_COMP_DFP);
+
+    return compInst->decompress_buffer((const int8_t *)src, (float *)dst, blockCount);
+}
+
+dl_comp_return_t dl_comp_compressed_buffer_sum( const void *inBuffer1,
+                                                const void *inBuffer2,
+                                                size_t dataCount,
+                                                void *outBuffer )
+{
+    return DL_COMP_FAIL_NOT_SUPPORTED;
+}
+
+size_t dl_comp_get_sizeof_block( dl_comp_data_type_t src_data_type,
+                                 size_t comp_ratio,
+                                 dl_comp_method_t method )
+{
+    size_t blockSize = 0;
+    if (src_data_type == DL_COMP_FLOAT32 &&
+        comp_ratio == 4 &&
+        method == DL_COMP_DFP) {
+        blockSize = sizeof(int8_t) * DL_COMP_BLOCK_NUM + sizeof(dl_comp_head);
+    }
+
+    return blockSize;
+}
+
+size_t dl_comp_get_elem_num_in_block()
+{
+    return DL_COMP_BLOCK_NUM;
+}
+
+dl_comp_return_t dl_comp_compressed_buffer_reduce_sum( const void *inBuffer,
+                                                       void *inoutBuffer,
+                                                       size_t blockCount )
+{
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(DL_COMP_DFP);
+
+    return compInst->compress_sum2((const int8_t *)inBuffer, (int8_t *)inoutBuffer, blockCount);
+}
+
+size_t dl_comp_convert_block_count(size_t dataCount)
+{
+    size_t blockCount = dataCount % DL_COMP_BLOCK_NUM == 0 ?
+                        (dataCount / DL_COMP_BLOCK_NUM) :
+                        (dataCount / DL_COMP_BLOCK_NUM + 1);
+    return blockCount;
+}
+
+bool dl_comp_check_running_environ()
+{
+    // Currently, we only check whether avx512 instructions are supported.
+    return dl_comp_check_avx512_supported();
+}
+
+int dl_comp_compress_buffer_FLOAT32ToINT8( const void *srcBuffer,
+                                           void *dstBuffer,
+                                           void *diff,
+                                           size_t dataCount)
+{
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(DL_COMP_DFP);
+
+    dl_comp_return_t ret = compInst->compress_buffer((float *)srcBuffer,
+                                                     (int8_t *)dstBuffer,
+                                                     (float *)diff,
+                                                     dataCount,
+                                                     srcBuffer == dstBuffer);
+    return ret;
+}
+
+int dl_comp_decompress_buffer_INT8ToFLOAT32(const void *srcBuffer,
+                                            void *dstBuffer,
+                                            size_t dataCount)
+{
+    dl_comp_head *compHead = (dl_comp_head *)srcBuffer;
+
+    if (compHead->magic != DL_COMP_HEAD_MAGIC) {
+        // Same MLSL work-around as in dl_comp_decompress_buffer above:
+        // an already decompressed buffer may be handed back for
+        // decompression, so we simply ignore it in this case.
+        return DL_COMP_OK;
+    }
+
+    // do de-compression
+    size_t blockCount = dataCount % DL_COMP_BLOCK_NUM == 0 ?
+                        (dataCount / DL_COMP_BLOCK_NUM) :
+                        (dataCount / DL_COMP_BLOCK_NUM + 1);
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(DL_COMP_DFP);
+    dl_comp_return_t ret = compInst->decompress_buffer((const int8_t *)srcBuffer, (float *)dstBuffer, blockCount);
+
+    return ret;
+}
diff --git a/dlcp/src/dl_compression_impl.cpp b/dlcp/src/dl_compression_impl.cpp
new file mode 100644
index 00000000..c51cb3b9
--- /dev/null
+++ b/dlcp/src/dl_compression_impl.cpp
@@ -0,0 +1,696 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <immintrin.h>
+
+#include "dl_compression.h"
+#include "dl_compression_impl.hpp"
+#include "dl_compression_util.hpp"
+
+bool g_avx512_supported = dl_comp_check_avx512_supported();
+
+bool dl_comp_check_avx512_supported()
+{
+    const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD |
+                                           _FEATURE_AVX512VL | _FEATURE_AVX512BW);
+    return _may_i_use_cpu_feature( avx512_features );
+}
+
+DLCompressBase* DLCompressBase::get_compression_instance(dl_comp_method_t method)
+{
+    DLCompressBase *pInstance = NULL;
+    static DLCompressDFP dfpInstance;
+
+    switch(method) {
+        case DL_COMP_DFP:
+            pInstance = &dfpInstance;
+            break;
+
+        case DL_COMP_NONE:
+
+        default:
+            pInstance = NULL;
+            DLCP_LOG(INFO, "Unsupported Compression Method");
+    }
+
+    return pInstance;
+}
+
+dl_comp_return_t DLCompressDFP::compress_block(float *src, int8_t *dst, float *diff, size_t count, int *scale)
+{
+    // Do quantization
+    // only handles a float buffer as src and int8_t as dst
+    float max_abs = 0.;
+    float max_abs_log2 = 0.;
+    float round_value, d_value;
+    int8_t decomp_value = 0;
+
+    if (NULL != diff) {
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (size_t i = 0; i < count; ++i) {
+            src[i] += diff[i];
+        }
+    }
+
+    for (size_t i = 0; i < count; ++i) {
+        max_abs = std::max(max_abs, std::abs(src[i]));
+    }
+
+    max_abs_log2 = std::log2f(max_abs);
+    // If max_abs_log2 is equal to -inf, max_abs is 0.
+    // In this case, we set the scale to 0.
+    if (max_abs_log2 * (-1.0) == std::numeric_limits<float>::infinity()) {
+        *scale = 0;
+    } else {
+        *scale = 8*sizeof(int8_t) - ((int)std::ceil(max_abs_log2) + 1);
+    }
+
+    float pow2_scale = std::pow(2, *scale);
+
+    for (size_t i = 0; i < count; ++i) {
+        // Corner case: src[i]*pow2_scale can exceed 127.5f, which would
+        // round up to 128, out of the int8_t range (-128..127). In that
+        // case we clamp it to 127.
+        round_value = std::round(src[i]*pow2_scale);
+        if (round_value <= 127.0f) {
+            decomp_value = (int8_t)round_value;
+        } else {
+            decomp_value = 127;
+        }
+        if (NULL != diff) {
+            d_value = ((float)decomp_value) / pow2_scale;
+            diff[i] = src[i] - d_value;
+        }
+        dst[i] = decomp_value;
+    }
+
+    return DL_COMP_OK;
+}
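+
+// Editorial walk-through of the scale selection above: for
+// max_abs = 0.3, log2f(0.3) ~= -1.74 and ceil(-1.74) = -1, so
+// scale = 8 - (-1 + 1) = 8 and pow2_scale = 256; 0.3 is then stored as
+// round(0.3 * 256) = 77, which decompresses to 77 / 256 ~= 0.3008.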
+
+dl_comp_return_t DLCompressDFP::avx512_compress_block(float *src, int8_t *dst, float *diff, size_t count, int *scale)
+{
+    // If count is smaller than a full block we use the non-avx512
+    // implementation, which also handles counts that are not a multiple
+    // of 16, the number of floats one avx512 register can hold.
+    if (count < DL_COMP_BLOCK_NUM) {
+        return compress_block(src, dst, diff, count, scale);
+    }
+
+
+    DLCP_ASSERT(count % 16 == 0, "count can't be divided by 16!");
+
+    // Do quantization
+    // Error FeedBack
+    if (NULL != diff) {
+        dl_comp_avx512_float_vector_add(diff, src, count);
+    }
+
+    float max_abs = 0.;
+    float max_abs_log2 = 0.;
+    size_t group_size = 16;
+    __m512 max_vec = _mm512_set1_ps(0.0f);
+
+    for (size_t idx = 0; idx < count; idx += group_size) {
+        __m512 float_vec = _mm512_loadu_ps(src+idx);
+        __m512 float_abs_vec = _mm512_abs_ps(float_vec);
+        __mmask16 cmp_mask = _mm512_cmp_ps_mask(max_vec, float_abs_vec, _CMP_GE_OS);
+        max_vec = _mm512_mask_mov_ps(float_abs_vec, cmp_mask, max_vec);
+    }
+
+    max_abs = _mm512_reduce_max_ps(max_vec);
+
+    max_abs_log2 = std::log2f(max_abs);
+    // If max_abs_log2 is equal to -inf, max_abs is 0.
+    // In this case, we set the scale to 0.
+    if (max_abs_log2 * (-1.0) == std::numeric_limits<float>::infinity()) {
+        *scale = 0;
+    } else {
+        *scale = 8*sizeof(int8_t) - ((int)std::ceil(max_abs_log2) + 1);
+    }
+
+    float pow2_scale = std::pow(2, *scale);
+
+    float pow2_scale_inv = 1.0f / std::pow(2, *scale);
+    __m512 pow2_scale_v = _mm512_set1_ps(pow2_scale);
+    __m512 pow2_scale_inv_v = _mm512_set1_ps(pow2_scale_inv);
+    __mmask16 mask = _mm512_int2mask(0xFFFF);
+    float *f32_diff;
+    for (size_t idx = 0; idx < count; idx += group_size) {
+        float *f32_src = src + idx;
+        int8_t *i8_dst = dst + idx;
+        __m512 f32_src_v = _mm512_loadu_ps(f32_src);
+        __m512 f32_result_v = _mm512_mul_ps(f32_src_v, pow2_scale_v);
+        __m512i i32_round_v = _mm512_cvt_roundps_epi32(f32_result_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        // saturation is already handled by the converting store instruction
+        _mm512_mask_cvtsepi32_storeu_epi8(i8_dst, mask, i32_round_v);
+        if (NULL != diff) {
+            f32_diff = diff + idx;
+            __m512 f32_round_v = _mm512_cvt_roundepi32_ps(i32_round_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+            __m512 f32_dequant_v = _mm512_mul_ps(f32_round_v, pow2_scale_inv_v);
+            __m512 f32_diff_v = _mm512_sub_ps(f32_src_v, f32_dequant_v);
+            _mm512_storeu_ps(f32_diff, f32_diff_v);
+        }
+    }
+    return DL_COMP_OK;
+}
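+
+// Buffer layout produced by compress_buffer below (editorial sketch,
+// derived from dl_comp_head and DL_COMP_BLOCK_NUM): each block is a
+// 12-byte packed head followed by up to 256 int8 payload bytes,
+//
+//   [head | 256 x int8][head | 256 x int8]...[head | tail x int8]
+//
+// In the inPlace case each block is quantized first and then shifted
+// up by sizeof(dl_comp_head) with memmove to make room for its head.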
+
+dl_comp_return_t DLCompressDFP::compress_buffer(float *src, int8_t *dst, float *diff, size_t count, bool inPlace)
+{
+    dl_comp_return_t ret = DL_COMP_FAIL;
+    dl_comp_head *compHead = NULL;
+    int scale = 0;
+    size_t comp_block = 0;
+    for (size_t i = 0; i < count; i += DL_COMP_BLOCK_NUM) {
+        comp_block = (i + DL_COMP_BLOCK_NUM) < count ? DL_COMP_BLOCK_NUM : (count - i);
+        compHead = (dl_comp_head *)dst;
+        if (!inPlace) {
+            dst += sizeof(dl_comp_head);
+        }
+        if (!avx512_enabled_ || comp_block < DL_COMP_BLOCK_NUM) {
+            ret = compress_block(src, dst, diff, comp_block, &scale);
+        } else {
+            ret = avx512_compress_block(src, dst, diff, comp_block, &scale);
+        }
+        if (ret == DL_COMP_FAIL) {
+            return ret;
+        }
+        if (inPlace) {
+            memmove(dst+sizeof(dl_comp_head), dst, comp_block);
+            dst += sizeof(dl_comp_head);
+        }
+        compHead->magic = DL_COMP_HEAD_MAGIC;
+        compHead->exponent = scale;
+        compHead->payloadLen = comp_block;
+        dst += comp_block;
+        src += comp_block;
+        if (NULL != diff) {
+            diff += comp_block;
+        }
+    }
+
+    return DL_COMP_OK;
+}
+
+dl_comp_return_t DLCompressDFP::compress_buffer(float *src, int8_t *dst, size_t count, bool inPlace)
+{
+    dl_comp_return_t ret = compress_buffer(src, dst, NULL, count, inPlace);
+    return ret;
+}
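+
+// Editorial note: decompress_buffer walks the blocks from last to
+// first so that in-place decompression (src == dst) does not clobber
+// compressed blocks that have not been read yet; a block's float
+// output is larger than its int8 payload. The first block, whose
+// payload overlaps the start of the output, is additionally staged
+// through the local decomp_block copy.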
+
+dl_comp_return_t DLCompressDFP::decompress_buffer(const int8_t *src, float *dst, size_t blockCount)
+{
+    dl_comp_head *compHead = NULL;
+    dl_comp_return_t ret;
+    size_t count;
+    int scale;
+    const int8_t *origSrc = src;
+    float *origDst = dst;
+    int8_t decomp_block[DL_COMP_BLOCK_NUM];
+
+
+    if (blockCount == 0) {
+        return DL_COMP_OK;
+    }
+
+    do {
+        src = origSrc + (blockCount - 1) * (sizeof(dl_comp_head) + DL_COMP_BLOCK_NUM);
+        dst = origDst + (blockCount - 1) * DL_COMP_BLOCK_NUM;
+        compHead = (dl_comp_head *)src;
+        if (compHead->magic != DL_COMP_HEAD_MAGIC) {
+            return DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT;
+        }
+        count = compHead->payloadLen;
+        scale = compHead->exponent;
+        if (blockCount == 1) {
+            memcpy(decomp_block, src + sizeof(dl_comp_head), count);
+        }
+        if (!avx512_enabled_) {
+            if (blockCount != 1) {
+                ret = decompress_block(src + sizeof(dl_comp_head), dst, count, scale);
+            } else {
+                ret = decompress_block(decomp_block, dst, count, scale);
+            }
+        } else {
+            if (blockCount != 1) {
+                ret = avx512_decompress_block(src + sizeof(dl_comp_head), dst, count, scale);
+            } else {
+                ret = avx512_decompress_block(decomp_block, dst, count, scale);
+            }
+        }
+        if (ret != DL_COMP_OK) {
+            return ret;
+        }
+        blockCount--;
+    } while (blockCount > 0);
+
+    return ret;
+}
+
+dl_comp_return_t DLCompressDFP::avx512_decompress_block(const int8_t *src, float *dst, size_t count, int scale)
+{
+    // If count is smaller than a full block we use the non-avx512
+    // implementation, which also handles counts that are not a multiple
+    // of 16, the number of floats one avx512 register can hold.
+    if (count < DL_COMP_BLOCK_NUM) {
+        return decompress_block(src, dst, count, scale);
+    }
+
+    DLCP_ASSERT(count % 16 == 0, "count can't be divided by 16!");
+
+    // Do de-quantization
+    float pow2_scale_inv = 1.0f / std::pow(2, scale);
+    size_t group_size = 16;
+    size_t num_group = count / group_size;
+    __m512 scale_factor = _mm512_set1_ps(pow2_scale_inv);
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t idx = 0; idx < count; idx += group_size) {
+        __m512 float_vec = _mm512_set_ps((float)src[idx + 15], (float)src[idx + 14],
+                                         (float)src[idx + 13], (float)src[idx + 12],
+                                         (float)src[idx + 11], (float)src[idx + 10],
+                                         (float)src[idx + 9],  (float)src[idx + 8],
+                                         (float)src[idx + 7],  (float)src[idx + 6],
+                                         (float)src[idx + 5],  (float)src[idx + 4],
+                                         (float)src[idx + 3],  (float)src[idx + 2],
+                                         (float)src[idx + 1],  (float)src[idx]);
+        __m512 result_vec = _mm512_mul_ps(float_vec, scale_factor);
+        _mm512_storeu_ps(dst+idx, result_vec);
+    }
+    return DL_COMP_OK;
+}
+
+dl_comp_return_t DLCompressDFP::decompress_block(const int8_t *src, float *dst, size_t count, int scale)
+{
+    // Do de-quantization
+    // only handles int8_t as src and float as dst
+    float pow2_scale_inv = 1.0f / std::pow(2, scale);
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t i = 0; i < count; ++i) {
+        dst[i] = (float)src[i];
+        dst[i] *= pow2_scale_inv;
+    }
+
+    return DL_COMP_OK;
+}
+
+size_t DLCompressDFP::get_dataCount_in_compressed_buffer(const int8_t *src, size_t blockCount) {
+    size_t count = 0;
+    size_t sum = 0;
+    dl_comp_head *compHead = NULL;
+
+    if (blockCount == 0) {
+        return sum;
+    }
+
+    do {
+        compHead = (dl_comp_head *)src;
+        DLCP_ASSERT(compHead->magic == DL_COMP_HEAD_MAGIC, "Invalid compHead!!!\n");
+        count = compHead->payloadLen;
+        src += sizeof(dl_comp_head);
+        src += count;
+        sum += count;
+        blockCount--;
+    } while (blockCount > 0);
+
+    return sum;
+}
+
+dl_comp_return_t DLCompressDFP::compress_sum(const int8_t *invec, int8_t *inoutvec, size_t blockCount)
+{
+    dl_comp_return_t ret = DL_COMP_OK;
+    const size_t blockSize = sizeof(dl_comp_head) + DL_COMP_BLOCK_NUM;
+    size_t inCount = get_dataCount_in_compressed_buffer((const int8_t*)invec, blockCount);
+    size_t outCount = get_dataCount_in_compressed_buffer((const int8_t*)inoutvec, blockCount);
+
+    DLCP_ASSERT(inCount == outCount, "inCount is not equal to outCount");
+
+    float deqBuf1[DL_COMP_BLOCK_NUM];
+    float deqBuf2[DL_COMP_BLOCK_NUM];
+
+    for (size_t i = 0; i < inCount; i += DL_COMP_BLOCK_NUM, invec += blockSize, inoutvec += blockSize) {
+        size_t compBlock = (i + DL_COMP_BLOCK_NUM) < inCount ? DL_COMP_BLOCK_NUM : (inCount - i);
+        decompress_buffer(invec, deqBuf1, 1);
+        decompress_buffer(inoutvec, deqBuf2, 1);
+        if (!avx512_enabled_) {
+            dl_comp_float_vector_add(deqBuf2, deqBuf1, compBlock);
+        } else {
+            dl_comp_avx512_float_vector_add(deqBuf2, deqBuf1, compBlock);
+        }
+        ret = compress_buffer(deqBuf1, inoutvec, compBlock, false);
+        if (ret != DL_COMP_OK) {
+            return ret;
+        }
+    }
+
+    return ret;
+}
+
+dl_comp_return_t DLCompressDFP::compress_sum2(const int8_t *invec, int8_t *inoutvec, size_t blockCount)
+{
+    const size_t blockSize = sizeof(dl_comp_head) + DL_COMP_BLOCK_NUM;
+    dl_comp_return_t ret = DL_COMP_OK;
+    // size_t count = get_dataCount_in_compressed_buffer((const int8_t*)invec, blockCount);
+
+    if (!avx512_enabled_) {
+        for (size_t i = 0; i < blockCount; i++, invec += blockSize, inoutvec += blockSize) {
+            ret = compress_block_sum(invec, inoutvec);
+            if (ret != DL_COMP_OK) {
+                return ret;
+            }
+        }
+    } else {
+        for (size_t i = 0; i < blockCount; i++, invec += blockSize, inoutvec += blockSize) {
+            ret = compress_block_sum2(invec, inoutvec);
+            if (ret != DL_COMP_OK) {
+                return ret;
+            }
+        }
+    }
+
+    return ret;
+}
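+
+// Editorial note on the two block-sum variants below: both align the
+// operands to the smaller of the two exponents by arithmetic right
+// shift, add the aligned int8 payloads, and, if any |sum| reaches 128,
+// shift the result right once more and decrement the shared exponent.
+// compress_block_sum is the scalar path, compress_block_sum2 the
+// AVX-512 path.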
+
+dl_comp_return_t DLCompressDFP::compress_block_sum(const int8_t *invec, int8_t *inoutvec)
+{
+    dl_comp_head *inHead = (dl_comp_head *)invec;
+    dl_comp_head *outHead = (dl_comp_head *)inoutvec;
+
+    size_t count = inHead->payloadLen;
+    int inScale = inHead->exponent;
+    int outScale = outHead->exponent;
+
+    if ((inHead->magic != DL_COMP_HEAD_MAGIC) || (outHead->magic != DL_COMP_HEAD_MAGIC)) {
+        return DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT;
+    }
+
+    if (inScale == 0) {
+        // invec contains all zeros; nothing to add.
+        return DL_COMP_OK;
+    }
+
+    if (outScale == 0) {
+        // inoutvec contains all zeros; just copy invec over.
+        memcpy(inoutvec, invec, sizeof(dl_comp_head) + count);
+        return DL_COMP_OK;
+    }
+
+    // The scales are exponents of 2: if they differ by more than 8, the
+    // smaller-magnitude operand vanishes after alignment, so we keep
+    // the larger-magnitude one and don't need to sum up.
+    if (std::abs(inScale - outScale) > 8) {
+        if (outScale < inScale) {
+            return DL_COMP_OK;
+        } else {
+            memcpy(inoutvec, invec, sizeof(dl_comp_head) + count);
+            return DL_COMP_OK;
+        }
+    }
+
+    int resvec[DL_COMP_BLOCK_NUM] = {0};
+    int minScale = std::min(inScale, outScale);
+    int inScaleGap = inScale - minScale;
+    int outScaleGap = outScale - minScale;
+    int max_abs = 0;
+
+
+    invec += sizeof(dl_comp_head);
+    inoutvec += sizeof(dl_comp_head);
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(|:max_abs)
+#endif
+    for (size_t i = 0; i < count; i++) {
+        int8_t left = invec[i] >> inScaleGap;
+        int8_t right = inoutvec[i] >> outScaleGap;
+        resvec[i] = left + right;
+        // This is the compensation for the final right shift. To make
+        // it an unbiased estimator, we only compensate when both the
+        // left operand and the sum are odd.
+        resvec[i] += resvec[i] & left & 1;
+        max_abs |= (resvec[i] > 0 ? resvec[i] : (-resvec[i]));
+    }
+
+    if (max_abs >= 128) {
+        minScale -= 1;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (size_t i = 0; i < count; i++) {
+            inoutvec[i] = resvec[i] >> 1;
+        }
+    } else {
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (size_t i = 0; i < count; i++) {
+            inoutvec[i] = resvec[i];
+        }
+    }
+
+    outHead->exponent = minScale;
+    return DL_COMP_OK;
+}
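+
+// Worked example (editorial): with inScale = 6 and outScale = 5 the
+// gap is 1, so each invec value is shifted right once before the add;
+// 100 at scale 6 (100/64 ~= 1.5625) becomes 50 at scale 5
+// (50/32 ~= 1.5625), i.e. the represented value is unchanged.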
+
+dl_comp_return_t DLCompressDFP::compress_block_sum2(const int8_t *invec, int8_t *inoutvec)
+{
+    dl_comp_head *inHead = (dl_comp_head *)invec;
+    dl_comp_head *outHead = (dl_comp_head *)inoutvec;
+
+    size_t count = inHead->payloadLen;
+    int inScale = inHead->exponent;
+    int outScale = outHead->exponent;
+
+    if ((inHead->magic != DL_COMP_HEAD_MAGIC) || (outHead->magic != DL_COMP_HEAD_MAGIC)) {
+        return DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT;
+    }
+
+    if (count % 16 != 0) {
+        return compress_block_sum(invec, inoutvec);
+    }
+
+    if (inScale == 0) {
+        // invec contains all zeros; nothing to add.
+        return DL_COMP_OK;
+    }
+
+    if (outScale == 0) {
+        // inoutvec contains all zeros; just copy invec over.
+        memcpy(inoutvec, invec, sizeof(dl_comp_head) + count);
+        return DL_COMP_OK;
+    }
+
+    // The scales are exponents of 2: if they differ by more than 7, the
+    // smaller-magnitude operand vanishes after alignment, so we keep
+    // the larger-magnitude one and don't need to sum up.
+    if (std::abs(inScale - outScale) > 7) {
+        if (outScale < inScale) {
+            return DL_COMP_OK;
+        } else {
+            memcpy(inoutvec, invec, sizeof(dl_comp_head) + count);
+            return DL_COMP_OK;
+        }
+    }
+
+    int32_t resvec[DL_COMP_BLOCK_NUM] = {0};
+    int minScale = std::min(inScale, outScale);
+    int inScaleGap = inScale - minScale;
+    int outScaleGap = outScale - minScale;
+    int max_abs = 0;
+    size_t group_size = 16;
+    __mmask16 mask = _mm512_int2mask(0xFFFF);
+    __m512i i32_one_v = _mm512_set1_epi32(1);
+    __m512i i32_or_v = _mm512_set1_epi32(0);
+
+    invec += sizeof(dl_comp_head);
+    inoutvec += sizeof(dl_comp_head);
+
+    for (size_t i = 0; i < count; i += group_size) {
+        const int8_t *i8_left = invec + i;
+        int8_t *i8_right = inoutvec + i;
+        int32_t *i32_result = resvec + i;
+        __m128i i8_left_v = _mm_maskz_loadu_epi8(mask, i8_left);
+        __m128i i8_right_v = _mm_maskz_loadu_epi8(mask, i8_right);
+        __m512i i32_left_v = _mm512_cvtepi8_epi32(i8_left_v);
+        __m512i i32_right_v = _mm512_cvtepi8_epi32(i8_right_v);
+        i32_left_v = _mm512_srai_epi32(i32_left_v, inScaleGap);
+        i32_right_v = _mm512_srai_epi32(i32_right_v, outScaleGap);
+        __m512i i32_result_v = _mm512_add_epi32(i32_left_v, i32_right_v);
+        // compensation for the final right shift, as in compress_block_sum
+        __m512i i32_comp_v = _mm512_and_epi32(i32_result_v, i32_left_v);
+        i32_comp_v = _mm512_and_epi32(i32_comp_v, i32_one_v);
+        i32_result_v = _mm512_add_epi32(i32_result_v, i32_comp_v);
+        _mm512_mask_storeu_epi32(i32_result, mask, i32_result_v);
+        // OR the absolute values together so overflow of the whole
+        // result can be detected below.
+        i32_result_v = _mm512_abs_epi32(i32_result_v);
+        i32_or_v = _mm512_or_epi32(i32_result_v, i32_or_v);
+    }
+
+    max_abs = _mm512_reduce_or_epi32(i32_or_v);
+
+    if (max_abs >= 128) {
+        minScale -= 1;
+        for (size_t i = 0; i < count; i += group_size) {
+            int32_t *i32_res = resvec + i;
+            int8_t *i8_inout = inoutvec + i;
+            __m512i i32resvec_v = _mm512_loadu_si512(i32_res);
+            i32resvec_v = _mm512_srai_epi32(i32resvec_v, 1);
+            _mm512_mask_cvtsepi32_storeu_epi8(i8_inout, mask, i32resvec_v);
+        }
+    } else {
+        for (size_t i = 0; i < count; i += group_size) {
+            int32_t *i32_res = resvec + i;
+            int8_t *i8_inout = inoutvec + i;
+            __m512i i32resvec_v = _mm512_loadu_si512(i32_res);
+            _mm512_mask_cvtsepi32_storeu_epi8(i8_inout, mask, i32resvec_v);
+        }
+    }
+
+    outHead->exponent = minScale;
+    return DL_COMP_OK;
+}
+
+void DLCompressDFP::dump_compressed_buffer(const int8_t *src, size_t blockCount)
+{
+    size_t count = 0;
+    dl_comp_head *compHead = NULL;
+    int scale = 0;
+    float pow2_scale = .0;
+
+    if (blockCount == 0) return;
+
+    DLCP_LOG(INFO, "Enter function dump_compressed_buffer...\n");
+    do {
+        compHead = (dl_comp_head *)src;
+        if (compHead->magic != DL_COMP_HEAD_MAGIC) {
+            DLCP_LOG(INFO, "Invalid compHead!!!\n");
+            return;
+        }
+        count = compHead->payloadLen;
+        scale = compHead->exponent;
+        DLCP_LOG(INFO, "count = %lu Scale = %d\n", count, scale);
+        pow2_scale = std::pow(2, scale);
+        src += sizeof(dl_comp_head);
+        for (size_t i = 0; i < count; i++) {
+            float d_value = ((float)src[i])/pow2_scale;
+            DLCP_LOG(INFO, "compressed value %d decompressed value %f\n", src[i], d_value);
+        }
+        src += count;
+        blockCount--;
+    } while (blockCount > 0);
+    DLCP_LOG(INFO, "End of function dump_compressed_buffer...\n");
+}
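+
+// Editorial note: check_compressed_buffer is a debugging aid; it
+// dequantizes each block and flags any element whose sign differs from
+// the original float, which would indicate a corrupted scale or
+// payload.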
+
+bool DLCompressDFP::check_compressed_buffer(const float *comp1, const int8_t *comp2, const float *diff, size_t blockCount)
+{
+    float epsilon = 1e-9;
+    dl_comp_head *compHead = NULL;
+    int scale = 0;
+    float pow2_scale = .0;
+    size_t count = 0;
+
+    do {
+        compHead = (dl_comp_head *)comp2;
+        if (compHead->magic != DL_COMP_HEAD_MAGIC) {
+            DLCP_LOG(ERROR, "Invalid compHead!!!\n");
+            return false;
+        }
+        count = compHead->payloadLen;
+        scale = compHead->exponent;
+        comp2 += sizeof(dl_comp_head);
+        pow2_scale = std::pow(2, scale);
+        for (size_t i = 0; i < count; i++) {
+            float d_value = ((float)comp2[i])/pow2_scale;
+            if (d_value * comp1[i] < 0.0f) {
+                DLCP_LOG(ERROR, "detected big gap src = %f d_value = %f diff = %f\n", comp1[i], d_value, diff[i]);
+                DLCP_LOG(ERROR, "scale = %d, pow2_scale = %f, compressed_value = %d\n", scale, std::pow(2, scale), comp2[i]);
+                return false;
+            }
+        }
+        comp1 += count;
+        comp2 += count;
+        diff += count;
+        blockCount--;
+    } while (blockCount > 0);
+
+    return true;
+}
+
+dl_comp_return_t compress_helper(float *src, int8_t *dst, float *diff, dl_comp_method_t method, size_t count)
+{
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(method);
+    dl_comp_return_t ret = compInst->compress_buffer(src, dst, diff, count);
+    return ret;
+}
+
+dl_comp_return_t decompress_helper(const int8_t *src, float *dst, dl_comp_method_t method)
+{
+    DLCompressBase *compInst = DLCompressBase::get_compression_instance(method);
+    return compInst->decompress_buffer(src, dst, 0);
+}
+
+void dl_comp_float_vector_add(const float* invec, float *inoutvec, size_t count)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t i = 0; i < count; ++i) {
+        inoutvec[i] += invec[i];
+    }
+}
+
+void dl_comp_avx512_float_vector_add(const float* invec, float *inoutvec, size_t count)
+{
+    // If count is smaller than 16 we use the non-avx512 implementation;
+    // 16 is the number of elements one avx512 register can hold.
+    if (count < 16) {
+        return dl_comp_float_vector_add(invec, inoutvec, count);
+    }
+
+    // If count can't be divided by 16, we handle the trailing remainder
+    // with the non-avx512 implementation.
+    if (count % 16 != 0) {
+        size_t remainder = count % 16;
+        count -= remainder;
+        dl_comp_float_vector_add(invec+count, inoutvec+count, remainder);
+    }
+
+    size_t group_size = 16;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t idx = 0; idx < count; idx += group_size) {
+        const float *fvec1 = invec + idx;
+        float *fvec2 = inoutvec + idx;
+        __m512 operand1 = _mm512_loadu_ps(fvec1);
+        __m512 operand2 = _mm512_loadu_ps(fvec2);
+        __m512 result = _mm512_add_ps(operand1, operand2);
+        _mm512_storeu_ps(fvec2, result);
+    }
+}
+
diff --git a/dlcp/src/dl_compression_impl.hpp b/dlcp/src/dl_compression_impl.hpp
new file mode 100644
index 00000000..9442f72b
--- /dev/null
+++ b/dlcp/src/dl_compression_impl.hpp
@@ -0,0 +1,122 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef DL_COMPRESSION_IMPL_HPP
+#define DL_COMPRESSION_IMPL_HPP
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "dl_compression.h"
+
+// Disable the copy and assignment operator for a class
+
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+private:\
+    classname(const classname&);\
+    classname& operator=(const classname&)
+
+#define DL_COMP_BLOCK_NUM 256
+
+#define DL_COMP_HEAD_MAGIC 0xdeadbeef
+
+typedef struct __attribute__((__packed__))
+{
+    int magic;
+    int payloadLen;
+    int exponent;
+} dl_comp_head;
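+
+// Editorial sanity check: the packed head is three ints (12 bytes on
+// the ABIs targeted here), which the block-size math in
+// dl_comp_get_sizeof_block relies on.
+static_assert(sizeof(dl_comp_head) == 3 * sizeof(int),
+              "dl_comp_head must remain packed");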
+
+bool dl_comp_check_avx512_supported(void);
+
+void dl_comp_float_vector_add(const float *invec, float *inoutvec, size_t count);
+
+void dl_comp_avx512_float_vector_add(const float *invec, float *inoutvec, size_t count);
+
+void dl_comp_int8_vector_add(const int8_t *invec, int8_t *inoutvec, size_t count);
+
+void dl_comp_avx512_int8_vector_add(const int8_t *invec, int8_t *inoutvec, size_t count);
+
+dl_comp_return_t compress_helper(float *src, int8_t *dst, float *diff, dl_comp_method_t method, size_t count);
+
+dl_comp_return_t decompress_helper(const int8_t *src, float *dst, dl_comp_method_t method);
+
+/*
+ * Abstract base class for quantization
+ */
+class DLCompressBase {
+
+public:
+    DLCompressBase() = default;
+    // Compress with error feedback
+    virtual dl_comp_return_t compress_buffer(float *src, int8_t *dst, float *diff, size_t count, bool inPlace = false) = 0;
+    // Compress without error feedback
+    virtual dl_comp_return_t compress_buffer(float *src, int8_t *dst, size_t count, bool inPlace = false) = 0;
+    virtual dl_comp_return_t decompress_buffer(const int8_t *src, float *dst, size_t blockCount) = 0;
+    virtual size_t get_dataCount_in_compressed_buffer(const int8_t *src, size_t blockCount) = 0;
+    virtual dl_comp_return_t compress_sum(const int8_t *invec, int8_t *inoutvec, size_t blockCount) = 0;
+    virtual dl_comp_return_t compress_sum2(const int8_t *invec, int8_t *inoutvec, size_t blockCount) = 0;
+    virtual void dump_compressed_buffer(const int8_t *src, size_t blockCount) = 0;
+    virtual bool check_compressed_buffer(const float *comp1, const int8_t *comp2, const float *diff, size_t blockCount) = 0;
+    virtual ~DLCompressBase(void) {};
+
+public:
+    static DLCompressBase* get_compression_instance(dl_comp_method_t method);
+
+    DISABLE_COPY_AND_ASSIGN(DLCompressBase);
+};
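+
+// Illustrative use of the factory above (editorial comment):
+//
+//   DLCompressBase *inst =
+//       DLCompressBase::get_compression_instance(DL_COMP_DFP);
+//   inst->compress_buffer(src, dst, diff, count);
+//
+// The returned instance is a function-local static, so the pointer is
+// never owned by the caller and must not be freed.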
+
+
+class DLCompressDFP : public DLCompressBase {
+
+    friend class DLCompressBase;
+public:
+    virtual ~DLCompressDFP(void) {};
+    virtual dl_comp_return_t compress_buffer(float *src, int8_t *dst, float *diff, size_t count, bool inPlace = false);
+    virtual dl_comp_return_t compress_buffer(float *src, int8_t *dst, size_t count, bool inPlace = false);
+    virtual dl_comp_return_t decompress_buffer(const int8_t *src, float *dst, size_t blockCount);
+    virtual size_t get_dataCount_in_compressed_buffer(const int8_t *src, size_t blockCount);
+    virtual dl_comp_return_t compress_sum(const int8_t *invec, int8_t *inoutvec, size_t blockCount);
+    virtual dl_comp_return_t compress_sum2(const int8_t *invec, int8_t *inoutvec, size_t blockCount);
+    virtual void dump_compressed_buffer(const int8_t *src, size_t blockCount);
+    virtual bool check_compressed_buffer(const float *comp1, const int8_t *comp2, const float *diff, size_t blockCount);
+
+private:
+    DLCompressDFP(): avx512_enabled_(dl_comp_check_avx512_supported()) {};
+
+private:
+    dl_comp_return_t compress_block(float *src, int8_t *dst, float *diff, size_t count, int *scale);
+    dl_comp_return_t decompress_block(const int8_t *src, float *dst, size_t count, int scale);
+    dl_comp_return_t avx512_decompress_block(const int8_t *src, float *dst, size_t count, int scale);
+    dl_comp_return_t avx512_compress_block(float *src, int8_t *dst, float *diff, size_t count, int *scale);
+    dl_comp_return_t compress_block_sum(const int8_t *invec, int8_t *inoutvec);
+    dl_comp_return_t compress_block_sum2(const int8_t *invec, int8_t *inoutvec);
+
+private:
+    bool avx512_enabled_;
+
+DISABLE_COPY_AND_ASSIGN(DLCompressDFP);
+};
+
+
+#endif /* DL_COMPRESSION_IMPL_HPP */
diff --git a/dlcp/src/dl_compression_util.cpp b/dlcp/src/dl_compression_util.cpp
new file mode 100644
index 00000000..ad05acb8
--- /dev/null
+++ b/dlcp/src/dl_compression_util.cpp
@@ -0,0 +1,36 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include "dl_compression_util.hpp"
+
+int g_log_th = 0;
+
+void dl_comp_get_time(char* buf, size_t bufSize)
+{
+    time_t timer;
+    struct tm* timeInfo = 0;
+    time(&timer);
+    timeInfo = localtime(&timer);
+    strftime(buf, bufSize, "%Y:%m:%d %H:%M:%S", timeInfo);
+}
diff --git a/dlcp/src/dl_compression_util.hpp b/dlcp/src/dl_compression_util.hpp
new file mode 100644
index 00000000..5b33de25
--- /dev/null
+++ b/dlcp/src/dl_compression_util.hpp
@@ -0,0 +1,100 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef DL_COMPRESSION_UTIL_HPP
+#define DL_COMPRESSION_UTIL_HPP
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <time.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+extern int g_log_th; // log threshold
+
+#define GET_TID() syscall(SYS_gettid)
+#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+
+#define DLCP_LOG(log_level, fmt, ...) \
+do { \
+    if (log_level <= g_log_th) \
+    { \
+        char time_buf[20]; \
+        dl_comp_get_time(time_buf, 20); \
+        switch (log_level) \
+        { \
+            case ERROR: \
+            { \
+                printf("%s: ERROR: (%ld): %s:%u " fmt "\n", time_buf, GET_TID(), \
+                       __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+                break; \
+            } \
+            case INFO: \
+            { \
+                printf("(%ld):" fmt "\n", GET_TID(), ##__VA_ARGS__); \
+                break; \
+            } \
+            case DEBUG: \
+            case TRACE: \
+            { \
+                printf("%s: (%ld): %s:%u " fmt "\n", time_buf, GET_TID(), \
+                       __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+                break; \
+            } \
+            default: \
+            { \
+                printf("(%ld):" fmt "\n", GET_TID(), ##__VA_ARGS__); \
+            } \
+        } \
+        fflush(stdout); \
+    } \
+} while (0)
+
+#define DLCP_ASSERT(cond, fmt, ...) \
+do \
+{ \
+    if (!(cond)) \
+    { \
+        fprintf(stderr, "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \
+                GET_TID(), __FILENAME__, __FUNCTION__, __LINE__, #cond, ##__VA_ARGS__); \
+        fflush(stderr); \
+        _exit(1); \
+    } \
+} while(0)
+
+enum LogLevel
+{
+    ERROR = 0,
+    INFO,
+    DEBUG,
+    TRACE
+};
+
+void dl_comp_get_time(char *buf, size_t buf_size);
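+
+// Example use of the logging macro above (editorial comment):
+//
+//   DLCP_LOG(ERROR, "ratio %zu not supported", ratio);
+//
+// Messages print only when the level is <= g_log_th; since g_log_th
+// defaults to 0 (ERROR), INFO/DEBUG/TRACE output is off by default.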
+
+#endif
diff --git a/dlcp/test/Makefile b/dlcp/test/Makefile
new file mode 100644
index 00000000..c3524f9a
--- /dev/null
+++ b/dlcp/test/Makefile
@@ -0,0 +1,68 @@
+LOCAL_DIR = $(shell pwd)
+
+#COMPILER ?= gnu
+COMPILER ?= intel
+
+FLAG_DEBUG ?= 0
+
+AR = ar
+CXXFLAGS += -fPIC
+
+ifeq ($(COMPILER), intel)
+    CC = icc
+    CXX = icpc
+    CXXFLAGS += -std=c++11
+    LDFLAGS += -static-intel
+else ifeq ($(COMPILER), gnu)
+    CC = gcc
+    CXX = g++
+    CXXFLAGS += -std=c++0x
+else
+    $(error Unsupported compiler $(COMPILER))
+endif
+
+ifeq ($(FLAG_DEBUG), 1)
+    CXXFLAGS += -O0 -g
+else
+    CXXFLAGS += -O2
+endif
+
+ifeq ($(CXX), icpc)
+    LDFLAGS += -static-intel -qopenmp
+endif
+
+COMPRESSION_LIB = libdlcomp.so
+SRC_DIR = $(LOCAL_DIR)
+INCL_DIR = $(LOCAL_DIR)/../include
+
+TARGET = test_compress_lib
+EXECUTE = test
+INCS = -I$(INCL_DIR) -I$(SRC_DIR)
+LDFLAGS += -L$(LOCAL_DIR)/../lib -ldlcomp
+CXXFLAGS += $(addprefix -I,$(INCL_DIR))
+
+
+SRCS += main.cpp
+
+OBJS := $(SRCS:.cpp=.o)
+
+
+all: $(TARGET)
+
+$(TARGET): $(EXECUTE)
+
+$(EXECUTE): $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $(EXECUTE) $(OBJS) $(LDFLAGS)
+
+$(SRC_DIR)/%.o: $(SRC_DIR)/%.cpp
+	$(CXX) -c $(CXXFLAGS) $< -o $@
+
+clean:
+	rm -f $(SRC_DIR)/*.o $(EXECUTE)
+
+cleanall: clean
+
diff --git a/dlcp/test/main.cpp b/dlcp/test/main.cpp
new file mode 100644
index 00000000..16962a67
--- /dev/null
+++ b/dlcp/test/main.cpp
@@ -0,0 +1,268 @@
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <ctime>
+
+#include "dl_compression.h"
+
+#define DATA_LEN 100000000
+
+float data1[DATA_LEN];
+
+float
data2[DATA_LEN]; + +void dataSetUp(void); + +bool test_compress_buffer(); + +bool test_decompress_buffer(); + +bool test_compressed_buffer_reduce_sum(); + +void addVec(const float *vec1, const float *vec2, float *vec3, int count) { + for (int i = 0; i < count; i++) { + vec3[i] = vec1[i] + vec2[i]; + } +} + +void cmpVec(const float *vec1, const float *vec2, int count) { + for (int i = 0; i < count; i++) { + if (std::abs(vec1[i] - vec2[i]) > 1e-3) { + printf("Detect big gap index: %d\n", i); + } + } +} + +float getSum(const float *src, int count) { + float sum = 0.0f; + for (int i = 0; i < count; i++) { + sum += src[i]; + } + return sum; +} + +void dumpVec(const float *vec, int count) { + for (int i = 0; i < count; i++) { + printf("vec[%d] = %lf\n", i, vec[i]); + } +} + +float sumVec(const float *vec1, const float *vec2, int count) { + float sum = 0.0f; + for (int i = 0; i < count; i++) { + sum = sum + vec1[i] + vec2[i]; + // printf("data1[%d] = %lf data2[%d] = %lf sum = %lf\n", i, vec1[i], i, vec2[i], vec1[i] + vec2[i]); + } + return sum; +} + +float sumVec2(const float *vec1, const float *vec2, int count) { + float sum = 0.0f; + for (int i = 0; i < count; i++) { + sum = sum + vec1[i] + vec2[i]; + // printf("tempData1[%d] = %lf tempData2[%d] = %lf sum = %lf\n", i, vec1[i], i, vec2[i], vec1[i] + vec2[i]); + } + return sum; +} + +int main(int argc, char *argv[]) +{ + dataSetUp(); + + if (!test_compress_buffer()) { + printf("test_compress_buffer failure!\n"); + } else { + printf("test_compress_buffer successful!\n"); + } + + if (!test_decompress_buffer()) { + printf("test_decompress_buffer failure!\n"); + } else { + printf("test_decompress_buffer successful!\n"); + } + + if (!test_compressed_buffer_reduce_sum()) { + printf("test_compressed_buffer_reduce_sum failure!\n"); + } else { + printf("test_compressed_buffer_reduce_sum successful!\n"); + } + + return 0; +} + +void dataSetUp() +{ + srand((int)time(0)); + + for (int i = 0; i < DATA_LEN; i++) { + data1[i] = (rand() % 10000) / (-100000.f) ; + } + + for (int i = 0; i < DATA_LEN; i++) { + data2[i] = (rand() % 10000) / (-100000.f); + } +} + +bool test_compress_buffer() +{ + float *tempData = (float *)malloc(sizeof(float) * DATA_LEN); + memcpy(tempData, data1, sizeof(float) * DATA_LEN); + + dl_comp_return_t ret = dl_comp_compress_buffer((const void *)tempData, + tempData, + DATA_LEN, + NULL, + DL_COMP_FLOAT32, + 4, + DL_COMP_DFP); + free(tempData); + if (ret != DL_COMP_OK) { + printf("compress failed error = %d!\n", ret); + return false; + } + + return true; +} + +bool test_decompress_buffer() +{ + float *tempData = (float *)malloc(sizeof(float) * DATA_LEN); + float *diff = (float *)malloc(sizeof(float) * DATA_LEN); + memcpy(tempData, data1, sizeof(float) * DATA_LEN); + memset(diff, 0, sizeof(float) * DATA_LEN); + + printf("before compress Total Sum: %f\n", getSum(data1, DATA_LEN)); + dl_comp_return_t ret = dl_comp_compress_buffer((const void *)tempData, + tempData, + DATA_LEN, + diff, + DL_COMP_FLOAT32, + 4, + DL_COMP_DFP); + if (ret != DL_COMP_OK) { + printf("compress failed error = %d!\n", ret); + free(tempData); + free(diff); + return false; + } + + ret = dl_comp_decompress_buffer((const void *)tempData, + tempData, + DATA_LEN); + if (ret != DL_COMP_OK) { + printf("de-compress failed error = %d!\n", ret); + free(tempData); + free(diff); + return false; + } + + printf("after compress Total Sum: %f diff: %f\n", getSum(tempData, DATA_LEN), getSum(diff, DATA_LEN)); + printf("after diff compensation Total Sum: %f\n", sumVec(tempData, diff, 
DATA_LEN)); + free(tempData); + free(diff); + return true; +} + +bool test_compressed_buffer_reduce_sum() +{ + float *tempData1 = (float *)malloc(sizeof(float) * DATA_LEN); + float *tempData2 = (float *)malloc(sizeof(float) * DATA_LEN); + float *tempData3 = (float *)malloc(sizeof(float) * DATA_LEN); + float *tempData4 = (float *)malloc(sizeof(float) * DATA_LEN); + float *sum1 = (float *)malloc(sizeof(float) * DATA_LEN); + float *sum2 = (float *)malloc(sizeof(float) * DATA_LEN); + float *sum3 = (float *)malloc(sizeof(float) * DATA_LEN); + memcpy(tempData1, data1, sizeof(float) * DATA_LEN); + memcpy(tempData2, data2, sizeof(float) * DATA_LEN); + + dl_comp_return_t ret = dl_comp_compress_buffer((const void *)tempData1, + tempData1, + DATA_LEN, + NULL, + DL_COMP_FLOAT32, + 4, + DL_COMP_DFP); + + if (ret != DL_COMP_OK) { + printf("compress failed error = %d!\n", ret); + free(tempData1); + free(tempData2); + return false; + } + + ret = dl_comp_compress_buffer((const void *)tempData2, + tempData2, + DATA_LEN, + NULL, + DL_COMP_FLOAT32, + 4, + DL_COMP_DFP); + + if (ret != DL_COMP_OK) { + printf("compress failed error = %d!\n", ret); + free(tempData1); + free(tempData2); + return false; + } + +#if 0 + ret = dl_comp_decompress_buffer((const void *)tempData1, + (void *)tempData3, + DATA_LEN); + ret = dl_comp_decompress_buffer((const void *)tempData2, + (void *)tempData4, + DATA_LEN); + + printf("orig data sum = %lf\n", sumVec(data1, data2, DATA_LEN)); + printf("new data sum = %lf\n", sumVec2(tempData3, tempData4, DATA_LEN)); +#endif + +#if 1 + size_t blockCount = dl_comp_convert_block_count(DATA_LEN); + + ret = dl_comp_compressed_buffer_reduce_sum((const void *)tempData1, + (void *)tempData2, + blockCount); + + if (ret != DL_COMP_OK) { + printf("reduce sum failed error = %d!\n", ret); + free(tempData1); + free(tempData2); + return false; + } + + ret = dl_comp_decompress_buffer((const void *)tempData2, + (void *)tempData2, + DATA_LEN); + + if (ret != DL_COMP_OK) { + printf("de compress failed error = %d!\n", ret); + free(tempData1); + free(tempData2); + return false; + } + + printf("orig data sum = %lf\n", sumVec(data1, data2, DATA_LEN)); + printf("new reduce sum = %lf\n", getSum(tempData2, DATA_LEN)); +#endif + +// addVec(data1, data2, sum1, DATA_LEN); +// addVec(tempData3, tempData4, sum2, DATA_LEN); + +// printf("start to cmp sum1 and tempData2!\n"); +// cmpVec(sum1, tempData2, DATA_LEN); + +// printf("start to cmp sum2 and sum1!\n"); +// cmpVec(sum2, sum1, DATA_LEN); + + + free(tempData1); + free(tempData2); + return true; +} diff --git a/dlcp/test/run.sh b/dlcp/test/run.sh new file mode 100755 index 00000000..dcf5f18f --- /dev/null +++ b/dlcp/test/run.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../lib/ +export OMP_NUM_THREADS=1 +./test diff --git a/python/external/__init__.py b/python/external/__init__.py new file mode 100644 index 00000000..791c3041 --- /dev/null +++ b/python/external/__init__.py @@ -0,0 +1 @@ +import external.mkldnn # NOQA diff --git a/python/external/dlcp/include/dl_compression.h b/python/external/dlcp/include/dl_compression.h new file mode 100644 index 00000000..e081461b --- /dev/null +++ b/python/external/dlcp/include/dl_compression.h @@ -0,0 +1,193 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef DL_COMPRESSION_H
+#define DL_COMPRESSION_H
+
+#include <stddef.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+    DL_COMP_NONE = 0,
+    DL_COMP_DFP = 1,
+} dl_comp_method_t;
+
+typedef enum {
+    DL_COMP_OK = 0,
+    DL_COMP_FAIL = 1,
+    DL_COMP_FAIL_SRC_DATA_TYPE_NOT_SUPPORTED = 2,
+    DL_COMP_FAIL_RATIO_NOT_SUPPORTED = 3,
+    DL_COMP_FAIL_COMP_METHOD_NOT_SUPPORTED = 4,
+    DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT = 5,
+    DL_COMP_FAIL_NOT_SUPPORTED = 6
+} dl_comp_return_t;
+
+typedef enum {
+    DL_COMP_INT8 = 0,
+    DL_COMP_FLOAT16 = 1,
+    DL_COMP_FLOAT32 = 2,
+    DL_COMP_FLOAT64 = 3,
+} dl_comp_data_type_t;
+
+// Compress the src buffer into the dst buffer.
+//
+// Parameters:
+// src           [in]     pointer to the src buffer
+// dst           [out]    pointer to the dst buffer
+// dataCount     [in]     number of elements to be compressed
+// diff          [in/out] takes the precision lost by the previous compress
+//                        call and returns the precision lost by this one.
+//                        Pass a NULL pointer if the lost precision is not
+//                        of interest.
+// src_data_type [in]     data type of the elements in the src buffer
+// comp_ratio    [in]     compression ratio; only 2, 4, 8, 16 and 32 are
+//                        supported. E.g. compressing FLOAT32 to INT8 is a
+//                        comp_ratio of 4.
+// method        [in]     compression algorithm
+// Returns:
+//     DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compress_buffer( const void *src,
+                                          void *dst,
+                                          size_t dataCount,
+                                          void *diff,
+                                          dl_comp_data_type_t src_data_type,
+                                          size_t comp_ratio,
+                                          dl_comp_method_t method );
+
+// Decompress the src buffer into the dst buffer.
+//
+// Parameters:
+// src       [in]  pointer to the src buffer
+// dst       [out] pointer to the dst buffer
+// dataCount [in]  number of elements to be decompressed
+// Returns:
+//     DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_decompress_buffer( const void *src,
+                                            void *dst,
+                                            size_t dataCount );
+
+// Sum up the compressed data of two input buffers and place the result
+// in outBuffer.
+//
+// Parameters:
+// inBuffer1 [in]  pointer to a quantized data vector
+// inBuffer2 [in]  pointer to a quantized data vector
+// dataCount [in]  number of elements of inBuffer1 and inBuffer2 to be
+//                 summed up.
+// outBuffer [out] pointer to a quantized data vector that receives the
+//                 result.
+// Returns:
+//     DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compressed_buffer_sum( const void *inBuffer1,
+                                                const void *inBuffer2,
+                                                size_t dataCount,
+                                                void *outBuffer );
+
+// Get the compressed meta data (block) info. Operations such as multi-node
+// all-reduce divide the payload into parts to improve communication
+// efficiency. This API reports the minimum slicing granularity of the
+// compressed data; its size depends on the src data type, comp_ratio and
+// compression algorithm.
+//
+// Parameters:
+// src_data_type [in] data type of the src data before compression.
+// comp_ratio    [in] compression ratio
+// method        [in] compression algorithm
+// Returns:
+//     size of one block in bytes.
+size_t dl_comp_get_sizeof_block( dl_comp_data_type_t src_data_type,
+                                 size_t comp_ratio,
+                                 dl_comp_method_t method );
+
+// Sum up the compressed data of two buffers and place the result in the
+// second buffer. Note that this API takes a blockCount, not a dataCount:
+// one block can contain multiple data elements.
+//
+// Parameters:
+// inBuffer    [in]     pointer to quantized data
+// inoutBuffer [in/out] pointer to quantized data; the result is placed
+//                      in this buffer.
+// blockCount  [in]     number of blocks to be summed up.
+// Returns:
+//     DL_COMP_OK on success, an error code otherwise.
+dl_comp_return_t dl_comp_compressed_buffer_reduce_sum( const void *inBuffer,
+                                                       void *inoutBuffer,
+                                                       size_t blockCount );
+
+// Util function for converting a data count into a block count.
+//
+// Parameters:
+// dataCount [in] number of data elements
+// Returns:
+//     the corresponding number of blocks.
+size_t dl_comp_convert_block_count(size_t dataCount);
+
+// Util function to get how many elements fit in one block.
+// Parameters:
+//     N/A
+// Returns:
+//     the number of elements in one block.
+size_t dl_comp_get_elem_num_in_block();
+
+// Check the running environment.
+// Parameters:
+//     N/A
+// Returns:
+//     true if the check succeeds, otherwise false.
+//     If false, please disable the quantization functionality. E.g. we
+//     suggest not using quantization on machines without AVX512 support,
+//     because there is no performance gain.
+bool dl_comp_check_running_environ();
+
+// Util function for compressing float32 data to int8.
+// Parameters:
+// srcBuffer [in]     src float32 data
+// dstBuffer [out]    dst int8 data
+// diff      [in/out] precision lost in compression
+// dataCount [in]     data count
+// Return:
+//     0 on success, otherwise an error code.
+int dl_comp_compress_buffer_FLOAT32ToINT8( const void *srcBuffer,
+                                           void *dstBuffer,
+                                           void *diff,
+                                           size_t dataCount);
+
+// Util function for de-compressing int8 data to float32.
+// Parameters:
+// srcBuffer [in]  compressed int8 data
+// dstBuffer [out] de-compressed float32 data
+// dataCount [in]  data count
+// Return:
+//     0 on success, otherwise an error code.
+int dl_comp_decompress_buffer_INT8ToFLOAT32(const void *srcBuffer,
+                                            void *dstBuffer,
+                                            size_t dataCount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
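For a quick smoke test of the API above, the prebuilt `libdlcomp.so` shipped under `python/external/dlcp/lib/` can be driven directly from Python. The sketch below is illustrative only: the enum values and signatures are taken from the header above, while the use of `ctypes` (rather than the SWIG binding) and the relative library path are assumptions.

```python
import ctypes
import numpy as np

# Assumed location of the prebuilt library; adjust for your install.
lib = ctypes.CDLL('python/external/dlcp/lib/libdlcomp.so')

DL_COMP_FLOAT32 = 2   # dl_comp_data_type_t
DL_COMP_DFP = 1       # dl_comp_method_t
DL_COMP_OK = 0        # dl_comp_return_t

data = np.random.rand(1024).astype(np.float32)
buf = data.copy()     # compressed in place, as the dlcp tests do
ptr = buf.ctypes.data_as(ctypes.c_void_p)

# Compress FLOAT32 -> INT8 (comp_ratio 4), discarding the precision diff.
ret = lib.dl_comp_compress_buffer(ptr, ptr, ctypes.c_size_t(buf.size),
                                  None, DL_COMP_FLOAT32,
                                  ctypes.c_size_t(4), DL_COMP_DFP)
assert ret == DL_COMP_OK

# Decompress back in place and compare against the original.
ret = lib.dl_comp_decompress_buffer(ptr, ptr, ctypes.c_size_t(buf.size))
assert ret == DL_COMP_OK
print('max quantization error:', np.abs(buf - data).max())
```

diff --git a/python/external/dlcp/lib/libdlcomp.so b/python/external/dlcp/lib/libdlcomp.so
new file mode 100755
index 00000000..5e0d245a
Binary files /dev/null and b/python/external/dlcp/lib/libdlcomp.so differ
diff --git a/python/external/mkldnn/__init__.py b/python/external/mkldnn/__init__.py
new file mode 100644
index 00000000..e2d012a6
--- /dev/null
+++ b/python/external/mkldnn/__init__.py
@@ -0,0 +1,6 @@
+from external.mkldnn import prepare_mkldnn  # NOQA
+from external.mkldnn.prepare_mkldnn import prepare  # NOQA
+from external.mkldnn.prepare_mkldnn import root  # NOQA
+from external.mkldnn.prepare_mkldnn import lib_path  # NOQA
+from external.mkldnn.prepare_mkldnn import include_path  # NOQA
+from external.mkldnn.prepare_mkldnn import source_path  # NOQA
diff --git a/python/external/mkldnn/prepare_mkldnn.py b/python/external/mkldnn/prepare_mkldnn.py
new file mode 100644
index 00000000..199d9327
--- /dev/null
+++ b/python/external/mkldnn/prepare_mkldnn.py
@@ -0,0 +1,127 @@
+import os
+import sys
+
+MKLDNN_ROOT = os.environ['HOME'] + '/.chainer'
+MKLDNN_WORK_PATH = os.path.split(os.path.realpath(__file__))[0]
+MKLDNN_LIB_PATH = MKLDNN_ROOT + '/lib'
+MKLDNN_INCLUDE_PATH = MKLDNN_ROOT + '/include'
+MKLDNN_SOURCE_PATH = MKLDNN_WORK_PATH + '/source'
+MKLDNN_BUILD_PATH = MKLDNN_WORK_PATH + '/source/build'
+MKLML_PKG_PATH = MKLDNN_SOURCE_PATH + '/external'
+
+
+def download(mkldnn_version):
+    print('Downloading ...')
+
+    os.chdir(MKLDNN_WORK_PATH)
+    os.system(
+        'git clone -b master \
+        --single-branch https://github.com/01org/mkl-dnn.git source')
+
+    os.chdir(MKLDNN_SOURCE_PATH)
+    os.system('git reset --hard %s' % mkldnn_version)
+
+    if not os.path.exists(MKLML_PKG_PATH):
+        os.system('cd scripts && ./prepare_mkl.sh && cd ..')
+
+
+def build():
+    print('Building ...')
+
+    os.system(
+        'mkdir -p build && cd build \
+        && cmake -DCMAKE_INSTALL_PREFIX=%s .. \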
+        && make -j' % MKLDNN_ROOT)
+
+
+def install(refresh_build):
+    print('Installing ...')
+
+    os.chdir(MKLDNN_SOURCE_PATH)
+
+    # install mkldnn
+    if refresh_build:
+        os.system('cd build && make -j && make install')
+    else:
+        os.system('cd build && make install')
+
+    # install mklml
+    mklml_pkg_path_leafs = os.listdir(MKLML_PKG_PATH)
+    mklml_origin_path = None
+    for leaf in mklml_pkg_path_leafs:
+        if os.path.isdir('%s/%s' % (MKLML_PKG_PATH, leaf)) and \
+                'mklml' in leaf:
+            mklml_origin_path = '%s/%s' % (MKLML_PKG_PATH, leaf)
+            break
+
+    if mklml_origin_path:
+        os.system('cp %s/lib/* %s' % (mklml_origin_path, MKLDNN_LIB_PATH))
+        os.system('cp %s/include/* %s' %
+                  (mklml_origin_path, MKLDNN_INCLUDE_PATH))
+
+
+def build_install():
+    build()
+    install(False)
+
+
+def download_build_install(mkldnn_version):
+    download(mkldnn_version)
+    build_install()
+
+
+def prepare(mkldnn_version):
+    print('Intel mkl-dnn preparing ...')
+    mkldnn_prepared = True
+    mkldnn_built = True
+    mkldnn_installed = True
+
+    if os.path.exists(MKLDNN_SOURCE_PATH):
+        os.chdir(MKLDNN_SOURCE_PATH)
+        res = os.popen('git log | sed -n \'1p\'', 'r')
+        commit_head = res.read()
+        if mkldnn_version not in commit_head:
+            # wrong commit checked out: rebuild everything from scratch
+            os.chdir(MKLDNN_WORK_PATH)
+            os.system('rm -rf %s' % MKLDNN_SOURCE_PATH)
+            os.system('rm -rf %s' % MKLDNN_LIB_PATH)
+            os.system('rm -rf %s' % MKLDNN_INCLUDE_PATH)
+            mkldnn_prepared = False
+        else:
+            if not os.path.exists(MKLDNN_BUILD_PATH):
+                # sources present but never built
+                os.system('rm -rf %s' % MKLDNN_LIB_PATH)
+                os.system('rm -rf %s' % MKLDNN_INCLUDE_PATH)
+                mkldnn_built = False
+            elif (not os.path.exists(MKLDNN_LIB_PATH)) or \
+                    (not os.path.exists(MKLDNN_INCLUDE_PATH)):
+                # built, but the install tree is incomplete
+                os.system('rm -rf %s' % MKLDNN_LIB_PATH)
+                os.system('rm -rf %s' % MKLDNN_INCLUDE_PATH)
+                mkldnn_installed = False
+    else:
+        os.system('rm -rf %s' % MKLDNN_LIB_PATH)
+        os.system('rm -rf %s' % MKLDNN_INCLUDE_PATH)
+        mkldnn_prepared = False
+
+    if not mkldnn_prepared:
+        download_build_install(mkldnn_version)
+    elif not mkldnn_built:
+        build_install()
+    elif not mkldnn_installed:
+        install(True)
+
+    os.chdir(sys.path[0])
+    print('Intel mkl-dnn prepared!')
+
+
+def root():
+    return MKLDNN_ROOT
+
+
+def lib_path():
+    return MKLDNN_LIB_PATH
+
+
+def include_path():
+    return MKLDNN_INCLUDE_PATH
+
+
+def source_path():
+    return MKLDNN_SOURCE_PATH
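The `prepare()` flow above decides between a full download-build-install, a rebuild, or a plain install by probing which of the source, build, lib and include trees already exist. A minimal sketch of how setup code might drive it; the pinned commit hash here is a placeholder, not the version the project actually uses:

```python
from external.mkldnn import prepare, lib_path, include_path

# Placeholder commit id; the real pin lives in the package's setup code.
MKLDNN_VERSION = '0123456789abcdef0123456789abcdef01234567'

# Clones, builds and installs mkl-dnn under ~/.chainer as needed;
# later calls are incremental thanks to the existence checks above.
prepare(MKLDNN_VERSION)

print('link against:', lib_path())
print('compile against:', include_path())
```

diff --git a/python/ideep4py/__init__.py b/python/ideep4py/__init__.py
new file mode 100644
index 00000000..8002fb32
--- /dev/null
+++ b/python/ideep4py/__init__.py
@@ -0,0 +1,171 @@
+import numpy
+import sys
+
+from ideep4py._ideep4py import intVector  # NOQA
+
+from ideep4py._ideep4py import mdarray  # NOQA
+from ideep4py._ideep4py import mdarrayVector  # NOQA
+
+from ideep4py._ideep4py import batchNormalization  # NOQA
+from ideep4py._ideep4py import concat  # NOQA
+from ideep4py._ideep4py import convolution2D  # NOQA
+from ideep4py._ideep4py import convolution2DParam as conv2DParam  # NOQA
+from ideep4py._ideep4py import dropout  # NOQA
+from ideep4py._ideep4py import linear  # NOQA
+from ideep4py._ideep4py import localResponseNormalization  # NOQA
+from ideep4py._ideep4py import localResponseNormalizationParam as lrnParam  # NOQA
+from ideep4py._ideep4py import pooling2D  # NOQA
+from ideep4py._ideep4py import pooling2DParam as pol2DParam  # NOQA
+from ideep4py._ideep4py import relu  # NOQA
+
+from ideep4py._ideep4py import basic_acc_sum  # NOQA
+from ideep4py._ideep4py import basic_copyto  # NOQA
+
+from ideep4py._ideep4py import dlCompression  # NOQA
+
+from ideep4py import cosim  # NOQA
+
+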
+# -----------------------------------------------------------------------------
+# ideep4py.mdarray allocation
+# -----------------------------------------------------------------------------
+dat_array = 'd'  # data array
+wgt_array = 'w'  # weight array
+
+
+def array(x, itype=dat_array):
+    """Create an :class:`ideep4py.mdarray` object according to ``x``.
+
+    Args:
+        x (numpy.ndarray or ideep4py.mdarray):
+            if ``x`` is a numpy.ndarray that is not C-contiguous, it is
+            converted to a C-contiguous array before the ideep4py.mdarray
+            is created.
+        itype: the created ideep4py.mdarray is optimized according to the
+            ``itype`` flag (``dat_array`` or ``wgt_array``).
+
+    Returns:
+        Instance of :class:`ideep4py.mdarray`.
+
+    """
+    if isinstance(x, numpy.ndarray) and \
+            x.dtype == numpy.dtype('float32'):
+        if x.flags.contiguous is False:
+            x = numpy.ascontiguousarray(x)
+        return mdarray(x, itype)
+    else:
+        return x
+
+
+_ideep4py_ = sys.modules[__name__]
+
+
+def get_array_module(array):
+    return _ideep4py_
+
+
+def check_ndim(inputs, supported_ndim=(2, 4)):
+    # Check the inputs against the dimensions ideep4py supports
+    valid_ndim = False
+    for ndim in supported_ndim:
+        valid_ndim = valid_ndim or inputs[0].ndim == ndim
+
+    if supported_ndim and not valid_ndim:
+        return False
+    else:
+        return True
+
+
+def check_type(inputs):
+    if isinstance(inputs[0], numpy.ndarray):
+        _should_use_ideep = True
+
+        for x in inputs:
+            _should_use_ideep = _should_use_ideep and \
+                x.dtype == numpy.dtype('float32') and \
+                x.size != 0
+        return _should_use_ideep
+    else:
+        return False
+
+
+def all_ready(inputs, supported_ndim=(2, 4)):
+    """Check input dimensions and types.
+
+    The function checks ``inputs`` against ``supported_ndim``.
+
+    Args:
+        inputs (numpy.ndarray, ideep4py.mdarray):
+            ``inputs`` to be checked, including array type, dimension
+            and data type.
+        supported_ndim: A tuple of ndims. iDeep supports arrays of
+            dimension 2 or 4 only.
+
+    Returns:
+        bool: ``True`` if all conditions are met.
+ + """ + + if check_ndim(inputs, supported_ndim) is False: + return False + elif isinstance(inputs[0], mdarray): + return True + else: + return check_type(inputs) + + +def tanh(x): + if all_ready((x,)): + y = _ideep4py.tanh.Forward(array(x)) # NOQA + else: + y = numpy.tanh(x) + + return y + + +def convolution2DParam(out_dims, dy, dx, sy, sx, ph, pw, pd, pr): + cp = conv2DParam() + cp.out_dims = intVector() + for d in out_dims: + cp.out_dims.push_back(d) + cp.dilate_y, cp.dilate_x = (dy - 1), (dx - 1) + cp.sy, cp.sx = sy, sx + cp.pad_lh, cp.pad_lw = ph, pw + cp.pad_rh, cp.pad_rw = pd, pr + return cp + + +def pooling2DParam(out_dims, kh, kw, sy, sx, ph, pw, pd, pr, algo): + pp = pol2DParam() + pp.out_dims = intVector() + for d in out_dims: + pp.out_dims.push_back(d) + pp.kh, pp.kw = kh, kw + pp.sy, pp.sx = sy, sx + pp.pad_lh, pp.pad_lw = ph, pw + pp.pad_rh, pp.pad_rw = pd, pr + pp.algo_kind = algo + return pp + + +pooling2DParam.pooling_max = pol2DParam.pooling_max +pooling2DParam.pooling_avg = pol2DParam.pooling_avg +pooling2DParam.pooling_avg_include_padding = \ + pol2DParam.pooling_avg_include_padding +pooling2DParam.pooling_avg_exclude_padding = \ + pol2DParam.pooling_avg_exclude_padding + + +def localResponseNormalizationParam(n, k, alpha, beta, algo): + lp = lrnParam() + lp.n = n + lp.k = k + lp.alpha = alpha + lp.beta = beta + lp.algo_kind = algo + return lp + + +localResponseNormalizationParam.lrn_across_channels = \ + lrnParam.lrn_across_channels +localResponseNormalizationParam.lrn_within_channel = \ + lrnParam.lrn_within_channel diff --git a/python/ideep4py/blas/sum.cc b/python/ideep4py/blas/sum.cc new file mode 100644 index 00000000..a3bcfd4b --- /dev/null +++ b/python/ideep4py/blas/sum.cc @@ -0,0 +1,498 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include +#include +#include +#include "tensor.h" +#include "sum.h" + +using namespace mkldnn; + +static inline bool optimized_format(Tensor *t) { + switch(t->format()) { + case mkldnn_nChw16c: + case mkldnn_nChw8c: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + return true; + default: + return false; + } +} + +template +static T * sum_nChwXC_along_channel(T *src, mkldnn_memory_format_t format, + mkldnn_dims_t dims, vector axis, T *dst) { + int mb = dims[0], + ic = dims[1], + ih = dims[2], + iw = dims[3]; + const int cg = format == mkldnn_nChw16c ? 16 : 8; + int cn = ic / cg; + + int blk_nthr = omp_get_max_threads(), + blk_num = blk_nthr, + blk_len = mb / blk_num, + blk_len_ex = mb % blk_num; + + if (!blk_len) + blk_nthr = mb; + + T *buf = reinterpret_cast(new avx::byte[ic * blk_nthr * sizeof(T)]); + + # pragma omp parallel num_threads(blk_nthr) + { + int ithr = omp_get_thread_num(); + int blen = ithr < blk_len_ex ? blk_len + 1 : blk_len; + int bstart = ithr <= blk_len_ex ? (blk_len + 1) * ithr : + blk_len_ex * (blk_len + 1) + (ithr - blk_len_ex) * blk_len; + int bend = bstart + blen; + + T *loc_src = src + bstart * ic * ih * iw; + if ((cg == 16) && (((unsigned long)buf & 0xf) == 0) && (((unsigned long)loc_src & 0xf) == 0)) { + for (int b = bstart; b < bend; b++) { + T *loc_buf = buf + ithr * ic; + for (int c = 0; c < cn; c++) { + if (b == bstart) + for (int o = 0; o < cg; o++) + loc_buf[o] = 0; + for (int hw = 0; hw < ih * iw; hw++) { + __asm__( + "mov %0, %%rax\n" + "mov %1, %%rbx\n" + ".byte 0x62, 0xf1, 0x7c, 0x48, 0x10, 0x00\n" //vmovups (%%rax), %%zmm0 + ".byte 0x62, 0xf1, 0x7c, 0x48, 0x58, 0x03\n" //vaddps (%%rbx), %%zmm0, %%zmm0 + ".byte 0x62, 0xf1, 0x7c, 0x48, 0x11, 0x00\n" //vmovups %%zmm0, (%%rax) + :"+r"(loc_buf) + :"r"(loc_src) + :"rax", "rbx" + ); + loc_src += cg; + } + + loc_buf += cg; + } + } + } else if ((cg == 8) && (((unsigned long)buf & 0x7) == 0) && (((unsigned long)loc_src & 0x7) == 0)) { + for (int b = bstart; b < bend; b++) { + T *loc_buf = buf + ithr * ic; + for (int c = 0; c < cn; c++) { + if (b == bstart) + for (int o = 0; o < cg; o++) + loc_buf[o] = 0; + for (int hw = 0; hw < ih * iw; hw++) { + __asm__( + "mov %0, %%rax\n" + "mov %1, %%rbx\n" + ".byte 0xc5, 0xfc, 0x10, 0x00\n" //vmovups (%%rax), %%ymm0 + ".byte 0xc5, 0xfc, 0x58, 0x03\n" //vaddps (%%rbx), %%ymm0, %%ymm0 + ".byte 0xc5, 0xfc, 0x11, 0x00\n" //vmovups %%ymm0, (%rax) + :"+r"(loc_buf) + :"r"(loc_src) + :"rax", "rbx" + ); + loc_src += cg; + } + + loc_buf += cg; + } + } + } else { + for (int b = bstart; b < bend; b++) { + T *loc_buf = buf + ithr * ic; + for (int c = 0; c < cn; c++) { + if (b == bstart) + for (int o = 0; o < cg; o++) + loc_buf[o] = 0; + + for (int hw = 0; hw < ih * iw; hw++) { + for (int o = 0; o < cg; o++) + loc_buf[o] += loc_src[o]; + loc_src += cg; + } + + loc_buf += cg; + } + } + } + + } + + // Allreduce + int c_nthr = omp_get_max_threads(), + c_num = c_nthr, + c_len = ic / c_num, + c_len_ex = ic % c_num; + + if (!c_len) + c_nthr = ic; + + # pragma omp parallel num_threads(c_nthr) + { + int ithr = omp_get_thread_num(); + int clen = ithr < c_len_ex ? c_len + 1 : c_len; + int cstart = ithr <= c_len_ex ? 
(c_len + 1) * ithr : + c_len_ex * (c_len + 1) + (ithr - c_len_ex) * c_len; + int cend = cstart + clen; + + for (int c = cstart; c < cend; c++) + dst[c] = 0; + + for (int i = 0; i < blk_nthr; i++) { + T *loc_buf = buf + i * ic; + for (int c = cstart; c < cend; c++) + dst[c] += loc_buf[c]; + } + } + + delete(reinterpret_cast(buf)); + + return dst; +} + +// 4 dimensions(NCHW/OIHW) opitimzation for mkldnn backend only. +Tensor * sum_opt_along_axis(Tensor *src, vector axis) { + int axises = axis.size(); + vector valid_axis_4dim = {0, 2, 3}; + + if (src->ndims() != 4 || axises != 3) { + return nullptr; + } + + auto valid_axis = [](int axises, + vector axis, + vector valid_axis) -> bool { + for (int i = 0; i < axises; i++) { + if (valid_axis[i] != axis[i]) + return false; + } + return true; + }; + + try { + switch (src->format()) { + case mkldnn_nChw8c: + if (!valid_axis(axises, axis, valid_axis_4dim)) + throw std::runtime_error( + "Invalid axis in tensor sum along axis "); + break; + case mkldnn_nChw16c: + if (!valid_axis(axises, axis, valid_axis_4dim)) + throw std::runtime_error( + "Invalid axis in tensor sum along axis "); + break; + default: + throw std::runtime_error( + "Invalid format in tensor sum along axis"); + break; + } + } catch (std::runtime_error &e) { + (void)e; + return nullptr; + } + + Tensor *dst = nullptr; + try { + switch (src->type()) { + case FLOAT32: + dst = new Tensor(1, {src->desc().data.dims[1]}, src->type()); + sum_nChwXC_along_channel(static_cast(src->data()), src->format(), + src->desc().data.dims, axis, static_cast(dst->data())); + break; + case SINT32: + dst = new Tensor(1, {src->desc().data.dims[1]}, src->type()); + sum_nChwXC_along_channel(static_cast(src->data()), src->format(), + src->desc().data.dims, axis, static_cast(dst->data())); + break; + case SINT16: + dst = new Tensor(1, {src->desc().data.dims[1]}, src->type()); + sum_nChwXC_along_channel(static_cast(src->data()), src->format(), + src->desc().data.dims, axis, static_cast(dst->data())); + break; + case SINT8: + dst = new Tensor(1, {src->desc().data.dims[1]}, src->type()); + sum_nChwXC_along_channel(static_cast(src->data()), src->format(), + src->desc().data.dims, axis, static_cast(dst->data())); + break; + case UINT8: + dst = new Tensor(1, {src->desc().data.dims[1]}, src->type()); + sum_nChwXC_along_channel(static_cast(src->data()), src->format(), + src->desc().data.dims, axis, static_cast(dst->data())); + break; + default: + throw std::runtime_error( + "Invalid dtype in tensor opt sum along axis"); + break; + } + } catch (std::runtime_error &e) { + (void)e; + return nullptr; + } + + return dst; +} + +// Less optimization gained in case of first dimension in small size +template +static T * sum_along_axis(T *src, int src_ndims, mkldnn_dims_t src_dims, + vector axis, vector dst_dims, T *dst) { + int tail = 1; + for (int d = 1; d < src_ndims; d++) + tail *= src_dims[d]; + + bool along_mb = false; + for (int a = 0; a < axis.size(); a++) { + if (axis[a] == 0) { + along_mb = true; + break; + } + } + + int gbl_ws_size = 1; + for (int d = 1; d < src_ndims; d++) { + int a = 0; + for (; a < axis.size(); a++) + if (d == axis[a]) + break; + + if (a >= axis.size()) + gbl_ws_size *= src_dims[d]; + } + + int mb = src_dims[0]; + int blk_nthr = omp_get_max_threads(), + blk_num = blk_nthr, + blk_len = mb / blk_num, + blk_len_ex = mb % blk_num; + + if (!blk_len) + blk_nthr = mb; + + T *gbl_ws[blk_nthr]; + # pragma omp parallel num_threads(blk_nthr) + { + int ithr = omp_get_thread_num(); + int blen = ithr < blk_len_ex ? 
blk_len + 1 : blk_len; + int bstart = ithr <= blk_len_ex ? (blk_len + 1) * ithr : + blk_len_ex * (blk_len + 1) + (ithr - blk_len_ex) * blk_len; + int bend = bstart + blen; + + T *loc_ws[blen]; + for (int b = bstart; b < bend; b++) { + T *loc_src = src + b * tail; + T *cur_src = loc_src; + + // Intialize for new blk + vector cur_dims; + for (int d = 0; d < src_ndims; d++) + cur_dims.push_back(src_dims[d]); + + vector cur_axis; + for (int a = 0; a < axis.size(); a++) + if (axis[a] != 0) + cur_axis.insert(cur_axis.begin(), axis[a]); + + // Sum along axis[a] + for (int a = 0; a < cur_axis.size(); a++) { + + int cur_fore = 1; + for (int d = 1; d < cur_axis[a]; d++) + cur_fore *= cur_dims[d]; + + int cur_tail = 1; + for (int d = cur_axis[a] + 1; d < cur_dims.size(); d++) + cur_tail *= cur_dims[d]; + + int cur_ws_size = cur_fore * cur_tail; + T *ws = reinterpret_cast(new avx::byte[cur_ws_size * sizeof(T)]); + for (int o = 0; o < cur_ws_size; o++) ws[o] = 0; + + // kernel + for (int w = 0; w < cur_ws_size; w++) { + for (int t = 0; t < cur_dims[cur_axis[a]]; t++) { + ws[w] += cur_src[w + t * cur_tail]; + } + } + + // adjust dims and cur_axis for sum in next axis + cur_dims.erase(cur_dims.begin() + cur_axis[a]); + for (int _a = a + 1; _a < cur_axis.size(); _a++) { + if (cur_axis[_a] > cur_axis[a]) + cur_axis[_a] -= 1; + } + + // refresh buffer + if (cur_src != loc_src) delete(reinterpret_cast(cur_src)); + if (a == cur_axis.size() - 1) loc_ws[b - bstart] = ws; + + cur_src = ws; + } + } + + if (along_mb) { + // local allreduce + if (src_ndims == 2 && axis.size() == 1 && axis[0] == 0) { + loc_ws[0] = reinterpret_cast(new avx::byte[tail * sizeof(T)]); + for (int o = 0; o < tail; o++) + loc_ws[0][o] = 0; + for (int b = bstart; b < bend; b++) { + T *loc_src = src + b * tail; + for (int o = 0; o < tail; o++) + loc_ws[0][o] += loc_src[o]; + } + } else { + for (int b = 1; b < blen; b++) { + for (int o = 0; o < gbl_ws_size; o++) + loc_ws[0][o] += loc_ws[b][o]; + delete(reinterpret_cast(loc_ws[b])); + } + } + + gbl_ws[ithr] = loc_ws[0]; + } else { + // cpy to dst + for (int b = bstart; b < bend; b++) { + for (int o = 0; o < gbl_ws_size; o++) + dst[b * gbl_ws_size + o] = loc_ws[b - bstart][o]; + delete(reinterpret_cast(loc_ws[b - bstart])); + } + } + } + + if (along_mb) { + // global allreduce + int c_nthr = omp_get_max_threads(), + c_num = c_nthr, + c_len = gbl_ws_size / c_num, + c_len_ex = gbl_ws_size % c_num; + + if (!c_len) + c_nthr = gbl_ws_size; + + # pragma omp parallel num_threads(c_nthr) + { + int ithr = omp_get_thread_num(); + int clen = ithr < c_len_ex ? c_len + 1 : c_len; + int cstart = ithr <= c_len_ex ? 
(c_len + 1) * ithr :
+                c_len_ex * (c_len + 1) + (ithr - c_len_ex) * c_len;
+            int cend = cstart + clen;
+
+            for (int c = cstart; c < cend; c++)
+                dst[c] = 0;
+
+            for (int i = 0; i < blk_nthr; i++) {
+                T *loc_buf = gbl_ws[i];
+                for (int c = cstart; c < cend; c++)
+                    dst[c] += loc_buf[c];
+            }
+        }
+
+        for (int i = 0; i < blk_nthr; i++)
+            delete [] (reinterpret_cast<avx::byte *>(gbl_ws[i]));
+    }
+
+    return dst;
+}
+
+Tensor * sum_common_along_axis(Tensor *src, vector<int> axis) {
+    auto dims = src->desc().data.dims;
+    vector<int> o_dims;
+    int o_ndims = src->ndims() - axis.size();
+
+    // TODO: Support sum all
+    if ((o_ndims != 1 && o_ndims != 2 && o_ndims != 4) ||
+        axis.size() == 0)
+        return nullptr;
+
+    for (int d = 0; d < src->ndims(); d++) {
+        unsigned a = 0;
+        for (; a < axis.size(); a++) {
+            if (d == axis[a])
+                break;
+        }
+
+        if (a >= axis.size())
+            o_dims.push_back(dims[d]);
+    }
+
+    Tensor *dst = nullptr;
+    try {
+        switch (src->type()) {
+        case FLOAT32:
+            dst = new Tensor(o_ndims, o_dims, src->type());
+            sum_along_axis(static_cast<float *>(src->data()),
+                src->ndims(), src->desc().data.dims, axis,
+                o_dims, static_cast<float *>(dst->data()));
+            break;
+        case SINT32:
+            dst = new Tensor(o_ndims, o_dims, src->type());
+            sum_along_axis(static_cast<int *>(src->data()),
+                src->ndims(), src->desc().data.dims, axis,
+                o_dims, static_cast<int *>(dst->data()));
+            break;
+        case SINT16:
+            dst = new Tensor(o_ndims, o_dims, src->type());
+            sum_along_axis(static_cast<int16_t *>(src->data()),
+                src->ndims(), src->desc().data.dims, axis,
+                o_dims, static_cast<int16_t *>(dst->data()));
+            break;
+        case SINT8:
+            dst = new Tensor(o_ndims, o_dims, src->type());
+            sum_along_axis(static_cast<int8_t *>(src->data()),
+                src->ndims(), src->desc().data.dims, axis,
+                o_dims, static_cast<int8_t *>(dst->data()));
+            break;
+        case UINT8:
+            dst = new Tensor(o_ndims, o_dims, src->type());
+            sum_along_axis(static_cast<uint8_t *>(src->data()),
+                src->ndims(), src->desc().data.dims, axis,
+                o_dims, static_cast<uint8_t *>(dst->data()));
+            break;
+        default:
+            throw std::runtime_error(
+                "Invalid dtype in tensor sum common along axis");
+            break;
+        }
+    } catch (std::runtime_error &e) {
+        (void)e;
+        return nullptr;
+    }
+
+    return dst;
+}
+
+Tensor * blas_sum(Tensor *src, vector<int> axis) {
+    if (optimized_format(src))
+        return sum_opt_along_axis(src, axis);
+    else
+        return sum_common_along_axis(src, axis);
+}
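As a cross-check on the two code paths above, the semantics of `blas_sum` can be mirrored in a few lines of numpy. This is only a reference model for testing, not part of the library; blocked formats such as nChw16c would first have to be reordered back to plain NCHW for the comparison to make sense:

```python
import numpy as np

def blas_sum_ref(src, axis):
    """Reference for blas_sum: sum `src` over the given axes.

    For the optimized path (nChw16c/nChw8c input, axis == (0, 2, 3))
    this reduces an NCHW tensor to a length-C vector of channel sums.
    """
    return np.asarray(src, dtype=np.float32).sum(axis=tuple(axis))

x = np.random.rand(2, 16, 4, 4).astype(np.float32)
per_channel = blas_sum_ref(x, (0, 2, 3))  # shape (16,): one sum per channel
assert per_channel.shape == (16,)
```

diff --git a/python/ideep4py/common/common.cc b/python/ideep4py/common/common.cc
new file mode 100644
index 00000000..9c795d0d
--- /dev/null
+++ b/python/ideep4py/common/common.cc
@@ -0,0 +1,42 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.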
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include "mkldnn.hpp" +#include "common.h" +#include "cpu_info.h" + +using namespace mkldnn; + +engine cpu_engine(engine::cpu, 0); +unsigned char dummy[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +#define DUMMY_VAL 0xcc + +bool enable_prim_reuse = true; // whether reuse primitive +bool enable_mem_reuse = true; // whether reuse output memory to next layer + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/common/common.h b/python/ideep4py/common/common.h new file mode 100644 index 00000000..e190e40f --- /dev/null +++ b/python/ideep4py/common/common.h @@ -0,0 +1,36 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +const mkldnn::memory::dims NONE_DIMS = {}; +#define PAGE_SIZE 4096 +extern unsigned char dummy[PAGE_SIZE]; +#endif // _COMMON_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/common/config.h b/python/ideep4py/common/config.h new file mode 100644 index 00000000..40989fa1 --- /dev/null +++ b/python/ideep4py/common/config.h @@ -0,0 +1,28 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _CONFIG_H_ +#define _CONFIG_H_ + +#endif // _CONFIG_H_ diff --git a/python/ideep4py/common/cpu_info.cc b/python/ideep4py/common/cpu_info.cc new file mode 100644 index 00000000..7a6c3854 --- /dev/null +++ b/python/ideep4py/common/cpu_info.cc @@ -0,0 +1,465 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include + +#include +#include +#include +#include + +#include "cpu_info.h" + +Processor::Processor() { + processor = 0; + physicalId = 0; + siblings = 0; + coreId = 0; + cpuCores = 0; + speedMHz = 0; +} + +CpuInfo::CpuInfo() { + loadContentFromFile("/proc/cpuinfo"); +} + +CpuInfo::CpuInfo(const char *content) { + loadContent(content); +} + +void CpuInfo::loadContentFromFile(const char *fileName) { + std::ifstream file(fileName); + std::string content( + (std::istreambuf_iterator(file)), + (std::istreambuf_iterator())); + + loadContent(content.c_str()); +} + +void CpuInfo::loadContent(const char *content) { + size_t contentLength = strlen(content); + char *contentCopy = new char[contentLength + 1]; + snprintf(contentCopy, contentLength + 1, "%s", content); + + parseLines(contentCopy); + + fileContentBegin = contentCopy; + fileContentEnd = &contentCopy[contentLength]; + currentLine = NULL; +} + +CpuInfo::~CpuInfo() { + delete [] fileContentBegin; +} + +void CpuInfo::parseLines(char *content) { + for (; *content; content++) { + if (*content == '\n') { + *content = '\0'; + } + } +} + +const char *CpuInfo::getFirstLine() { + currentLine = fileContentBegin < fileContentEnd ? fileContentBegin : NULL; + return getNextLine(); +} + +const char *CpuInfo::getNextLine() { + if (!currentLine) { + return NULL; + } + + const char *savedCurrentLine = currentLine; + while (*(currentLine++)) { + } + + if (currentLine >= fileContentEnd) { + currentLine = NULL; + } + + return savedCurrentLine; +} + +Collection::Collection(CpuInfoInterface *cpuInfo) : cpuInfo(*cpuInfo) { + totalNumberOfSockets = 0; + totalNumberOfCpuCores = 0; + currentProcessor = NULL; + + processors.reserve(96); + + parseCpuInfo(); + collectBasicCpuInformation(); +} + +unsigned Collection::getProcessorSpeedMHz() { + return processors.size() ? 
processors[0].speedMHz : 0; +} + +unsigned Collection::getTotalNumberOfSockets() { + return totalNumberOfSockets; +} + +unsigned Collection::getTotalNumberOfCpuCores() { + return totalNumberOfCpuCores; +} + +unsigned Collection::getNumberOfProcessors() { + return processors.size(); +} + +const Processor &Collection::getProcessor(unsigned processorId) { + return processors[processorId]; +} + +void Collection::parseCpuInfo() { + const char *cpuInfoLine = cpuInfo.getFirstLine(); + for (; cpuInfoLine; cpuInfoLine = cpuInfo.getNextLine()) { + parseCpuInfoLine(cpuInfoLine); + } +} + +void Collection::parseCpuInfoLine(const char *cpuInfoLine) { + int delimiterPosition = strcspn(cpuInfoLine, ":"); + + if (cpuInfoLine[delimiterPosition] == '\0') { + currentProcessor = NULL; + } else { + parseValue(cpuInfoLine, &cpuInfoLine[delimiterPosition + 2]); + } +} + +void Collection::parseValue(const char *fieldName, const char *valueString) { + if (!currentProcessor) { + appendNewProcessor(); + } + + if (beginsWith(fieldName, "processor")) { + currentProcessor->processor = parseInteger(valueString); + } + + if (beginsWith(fieldName, "physical id")) { + currentProcessor->physicalId = parseInteger(valueString); + } + + if (beginsWith(fieldName, "siblings")) { + currentProcessor->siblings = parseInteger(valueString); + } + + if (beginsWith(fieldName, "core id")) { + currentProcessor->coreId = parseInteger(valueString); + } + + if (beginsWith(fieldName, "cpu cores")) { + currentProcessor->cpuCores = parseInteger(valueString); + } + + if (beginsWith(fieldName, "model name")) { + currentProcessor->speedMHz = extractSpeedFromModelName(valueString); + } +} + +void Collection::appendNewProcessor() { + processors.push_back(Processor()); + currentProcessor = &processors.back(); +} + +bool Collection::beginsWith(const char *lineBuffer, const char *text) const { + while (*text) { + if (*(lineBuffer++) != *(text++)) { + return false; + } + } + + return true; +} + +unsigned Collection::parseInteger(const char *text) const { + return atol(text); +} + +/* Function extracts CPU speed from model name. If unit is not set it is + assumed that values below 100 are specified in GHz, otherwise MHz */ +unsigned Collection::extractSpeedFromModelName(const char *text) const { + text = strstr(text, "@"); + if (!text) { + return 0; + } + + char *unit; + double speed = strtod(&text[1], &unit); + + while (isspace(*unit)) { + unit++; + } + + bool isMHz = !strncmp(unit, "MHz", 3); + bool isGHz = !strncmp(unit, "GHz", 3); + bool isGHzPossible = (speed < 100); + + if (isGHz || (isGHzPossible && !isMHz)) { + return 1000 * speed + 0.5; + } else { + return speed + 0.5; + } +} + +void Collection::collectBasicCpuInformation() { + std::set uniquePhysicalId; + std::vector::iterator processor = processors.begin(); + for (; processor != processors.end(); processor++) { + uniquePhysicalId.insert(processor->physicalId); + updateCpuInformation(*processor, uniquePhysicalId.size()); + } +} + +void Collection::updateCpuInformation(const Processor &processor, + unsigned numberOfUniquePhysicalId) { + if (totalNumberOfSockets == numberOfUniquePhysicalId) { + return; + } + + totalNumberOfSockets = numberOfUniquePhysicalId; + totalNumberOfCpuCores += processor.cpuCores; +} + +/* The OpenMpManager class is responsible for determining a set of all of + available CPU cores and delegating each core to perform other tasks. The + first of available cores is delegated for background threads, while other + remaining cores are dedicated for OpenMP threads. 
Each OpenMP thread owns + one core for exclusive use. The number of OpenMP threads is then limited + to the number of available cores minus one. The amount of CPU cores may + be limited by system eg. when numactl was used. */ + +#include +#include + +static const char *openMpEnvVars[] = { + "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC", + "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED", + "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE", + "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY", + "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS", + "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY", + "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY", + "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS", + "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS" +}; + +static const unsigned numberOfOpenMpEnvVars = + sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]); + +OpenMpManager::OpenMpManager(Collection *collection) : + mainThreadId(boost::this_thread::get_id()), + collection(*collection) { + getOpenMpEnvVars(); + getCurrentCpuSet(); + getCurrentCoreSet(); +} + +OpenMpManager &OpenMpManager::get_instance() { + static CpuInfo cpuInfo; + static Collection collection(&cpuInfo); + static OpenMpManager openMpManager(&collection); + return openMpManager; +} + +void OpenMpManager::setGpuEnabled() { + OpenMpManager &openMpManager = get_instance(); + openMpManager.isGpuEnabled = true; +} + +void OpenMpManager::setGpuDisabled() { + OpenMpManager &openMpManager = get_instance(); + openMpManager.isGpuEnabled = false; +} + +bool OpenMpManager::isMajorThread(boost::thread::id currentThread) { + OpenMpManager &openMpManager = get_instance(); + return (boost::this_thread::get_id() == openMpManager.mainThreadId); +} + +// Ideally bind given thread to secondary logical core, if +// only one thread exists then bind to primary one +void OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible() { + OpenMpManager &openMpManager = get_instance(); + if (openMpManager.isThreadsBindAllowed()) { + int totalNumberOfAvailableCores = CPU_COUNT(&openMpManager.currentCoreSet); + int logicalCoreToBindTo = totalNumberOfAvailableCores > 1 ? 1 : 0; + openMpManager.bindCurrentThreadToLogicalCoreCpus(logicalCoreToBindTo); + } +} + +void OpenMpManager::bindOpenMpThreads() { + OpenMpManager &openMpManager = get_instance(); + + if (!openMpManager.isThreadsBindAllowed()) + return; + + openMpManager.setOpenMpThreadNumberLimit(); + #pragma omp parallel + { + unsigned logicalCoreId = omp_get_thread_num(); + openMpManager.bindCurrentThreadToLogicalCoreCpu(logicalCoreId); + } +} + +void OpenMpManager::getOpenMpEnvVars() { + isAnyOpenMpEnvVarSpecified = false; + for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) { + if (getenv(openMpEnvVars[i])) { + isAnyOpenMpEnvVarSpecified = true; + } + } +} + +void OpenMpManager::getCurrentCpuSet() { + if (sched_getaffinity(0, sizeof(currentCpuSet), ¤tCpuSet)) { + getDefaultCpuSet(¤tCpuSet); + } +} + +void OpenMpManager::getDefaultCpuSet(cpu_set_t *defaultCpuSet) { + CPU_ZERO(defaultCpuSet); + unsigned numberOfProcessors = collection.getNumberOfProcessors(); + for (unsigned processorId = 0; processorId < numberOfProcessors; processorId++) { + CPU_SET(processorId, defaultCpuSet); + } +} + +/* Function getCurrentCoreSet() fills currentCoreSet variable with a set of + available CPUs, where only one CPU per core is chosen. 
When multiple CPUs + of single core are used, function is selecting only first one of all + available. */ + +void OpenMpManager::getCurrentCoreSet() { + unsigned numberOfProcessors = collection.getNumberOfProcessors(); + unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores(); + + cpu_set_t usedCoreSet; + CPU_ZERO(&usedCoreSet); + CPU_ZERO(¤tCoreSet); + + for (unsigned processorId = 0; processorId < numberOfProcessors; processorId++) { + if (CPU_ISSET(processorId, ¤tCpuSet)) { + unsigned coreId = processorId % totalNumberOfCpuCores; + if (!CPU_ISSET(coreId, &usedCoreSet)) { + CPU_SET(coreId, &usedCoreSet); + CPU_SET(processorId, ¤tCoreSet); + } + } + } +} + +void OpenMpManager::selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId) { + unsigned numberOfProcessors = collection.getNumberOfProcessors(); + unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores(); + + unsigned processorId = physicalCoreId % totalNumberOfCpuCores; + while (processorId < numberOfProcessors) { + if (CPU_ISSET(processorId, ¤tCpuSet)) { + CPU_SET(processorId, set); + } + + processorId += totalNumberOfCpuCores; + } +} + +unsigned OpenMpManager::getPhysicalCoreId(unsigned logicalCoreId) { + unsigned numberOfProcessors = collection.getNumberOfProcessors(); + + for (unsigned processorId = 0; processorId < numberOfProcessors; processorId++) { + if (CPU_ISSET(processorId, ¤tCoreSet)) { + if (!logicalCoreId--) { + return processorId; + } + } + } + + LOG(FATAL) << "This should never happen!"; + return 0; +} + +bool OpenMpManager::isThreadsBindAllowed() { + return !isAnyOpenMpEnvVarSpecified && !isGpuEnabled; +} + +// Limit of threads to number of logical cores available +void OpenMpManager::setOpenMpThreadNumberLimit() { + omp_set_num_threads(CPU_COUNT(¤tCoreSet)); +} + +void OpenMpManager::bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId) { + unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId); + + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(physicalCoreId, &set); + sched_setaffinity(0, sizeof(set), &set); +} + +void OpenMpManager::bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId) { + unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId); + + cpu_set_t set; + CPU_ZERO(&set); + selectAllCoreCpus(&set, physicalCoreId); + sched_setaffinity(0, sizeof(set), &set); +} + +void OpenMpManager::printVerboseInformation() { + OpenMpManager &openMpManager = get_instance(); + + LOG(INFO) << "Processor speed [MHz]: " + << openMpManager.collection.getProcessorSpeedMHz(); + + LOG(INFO) << "Total number of sockets: " + << openMpManager.collection.getTotalNumberOfSockets(); + + LOG(INFO) << "Total number of CPU cores: " + << openMpManager.collection.getTotalNumberOfCpuCores(); + + LOG(INFO) << "Total number of processors: " + << openMpManager.collection.getNumberOfProcessors(); + + LOG(INFO) << "GPU is used: " + << (openMpManager.isGpuEnabled ? "yes" : "no"); + + LOG(INFO) << "OpenMP environmental variables are specified: " + << (openMpManager.isAnyOpenMpEnvVarSpecified ? "yes" : "no"); + + LOG(INFO) << "OpenMP thread bind allowed: " + << (openMpManager.isThreadsBindAllowed() ? 
"yes" : "no"); + + LOG(INFO) << "Number of OpenMP threads: " + << omp_get_max_threads(); +} + +unsigned OpenMpManager::getProcessorSpeedMHz() { + OpenMpManager &openMpManager = get_instance(); + return openMpManager.collection.getProcessorSpeedMHz(); +} + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/common/cpu_info.h b/python/ideep4py/common/cpu_info.h new file mode 100644 index 00000000..c8cd722a --- /dev/null +++ b/python/ideep4py/common/cpu_info.h @@ -0,0 +1,166 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _CPU_INFO_H +#define _CPU_INFO_H + +#include +#include +#include +#include +#include +#include +#include +//#include "utils.h" + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +struct Processor { + unsigned processor; + unsigned physicalId; + unsigned siblings; + unsigned coreId; + unsigned cpuCores; + unsigned speedMHz; + + Processor(); +}; + +class CpuInfoInterface { + public: + virtual ~CpuInfoInterface() {} + virtual const char *getFirstLine() = 0; + virtual const char *getNextLine() = 0; +}; + +class CpuInfo : public CpuInfoInterface { + public: + CpuInfo(); + explicit CpuInfo(const char *content); + virtual ~CpuInfo(); + + virtual const char *getFirstLine(); + virtual const char *getNextLine(); + + private: + const char *fileContentBegin; + const char *fileContentEnd; + const char *currentLine; + + void loadContentFromFile(const char *fileName); + void loadContent(const char *content); + void parseLines(char *content); + DISALLOW_COPY_AND_ASSIGN(CpuInfo); +}; + +class CollectionInterface { + public: + virtual ~CollectionInterface() {} + virtual unsigned getProcessorSpeedMHz() = 0; + virtual unsigned getTotalNumberOfSockets() = 0; + virtual unsigned getTotalNumberOfCpuCores() = 0; + virtual unsigned getNumberOfProcessors() = 0; + virtual const Processor &getProcessor(unsigned processorId) = 0; +}; + +class Collection : public CollectionInterface { + public: + explicit Collection(CpuInfoInterface *cpuInfo); + + virtual unsigned getProcessorSpeedMHz(); + virtual unsigned getTotalNumberOfSockets(); + virtual unsigned getTotalNumberOfCpuCores(); + virtual unsigned getNumberOfProcessors(); + virtual const Processor &getProcessor(unsigned processorId); + + private: + CpuInfoInterface &cpuInfo; + unsigned totalNumberOfSockets; + unsigned totalNumberOfCpuCores; + std::vector processors; + 
Processor *currentProcessor; + + Collection(const Collection &collection); + Collection &operator =(const Collection &collection); + + void parseCpuInfo(); + void parseCpuInfoLine(const char *cpuInfoLine); + void parseValue(const char *fieldName, const char *valueString); + void appendNewProcessor(); + bool beginsWith(const char *lineBuffer, const char *text) const; + unsigned parseInteger(const char *text) const; + unsigned extractSpeedFromModelName(const char *text) const; + + void collectBasicCpuInformation(); + void updateCpuInformation(const Processor &processor, + unsigned numberOfUniquePhysicalId); +}; + +class OpenMpManager { + public: + static void setGpuEnabled(); + static void setGpuDisabled(); + + static void bindCurrentThreadToNonPrimaryCoreIfPossible(); + + static void bindOpenMpThreads(); + static void printVerboseInformation(); + + static bool isMajorThread(boost::thread::id currentThread); + static unsigned getProcessorSpeedMHz(); + + private: + boost::thread::id mainThreadId; + Collection &collection; + + bool isGpuEnabled; + bool isAnyOpenMpEnvVarSpecified; + cpu_set_t currentCpuSet; + cpu_set_t currentCoreSet; + + explicit OpenMpManager(Collection *collection); + OpenMpManager(const OpenMpManager &openMpManager); + OpenMpManager &operator =(const OpenMpManager &openMpManager); + static OpenMpManager &get_instance(); + + void getOpenMpEnvVars(); + void getCurrentCpuSet(); + void getDefaultCpuSet(cpu_set_t *defaultCpuSet); + void getCurrentCoreSet(); + + void selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId); + unsigned getPhysicalCoreId(unsigned logicalCoreId); + + bool isThreadsBindAllowed(); + void setOpenMpThreadNumberLimit(); + void bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId); + void bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId); +}; + +#endif // _CPU_INFO_H + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/common/utils.cc b/python/ideep4py/common/utils.cc new file mode 100644 index 00000000..4e497fad --- /dev/null +++ b/python/ideep4py/common/utils.cc @@ -0,0 +1,359 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include "utils.h" +#include + +#if defined(_MSC_VER) +static inline uint64_t __cpuidXfeature() +{ +#if (_MSC_VER > 1600) + return _xgetbv(0); +#else + uint32_t a, d; + __asm { + push edx + push ecx + push eax + xor ecx, ecx + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 + mov a, eax + mov d, edx + pop eax + pop ecx + pop edx + } + return (((uint64_t)d << 32) | a); +#endif +} + +#if (_MSC_VER < 1400) +static inline __declspec(naked) void __cpuid(int[4] result, int level) +{ + __asm { + push ebx + push edi + mov eax, dword ptr [esp + 4 * 4] // level + cpuid + mov edi, dword ptr [esp + 4 * 3] // result + mov dword ptr [edi + 4 * 0], eax // result[0] + mov dword ptr [edi + 4 * 1], ebx // result[1] + mov dword ptr [edi + 4 * 2], ecx // result[2] + mov dword ptr [edi + 4 * 3], edx // result[3] + pop edi + pop ebx + ret + } +} + +static inline __declspec(naked) void __cpuidex(int[4] result, int level, int count) +{ + __asm { + push ebx + push ecx + push edi + mov ecx, dword ptr [esp + 4 * 6] // count + mov eax, dword ptr [esp + 4 * 5] // level + cpuid + mov edi, dword ptr [esp + 4 * 4] // result + mov dword ptr [edi + 4 * 0], eax // result[0] + mov dword ptr [edi + 4 * 1], ebx // result[1] + mov dword ptr [edi + 4 * 2], ecx // result[2] + mov dword ptr [edi + 4 * 3], edx // result[3] + pop edi + pop ecx + pop ebx + ret + } +} + +#else +#include + +#endif + +#else // Non-MSC +static inline uint64_t __cpuidXfeature() +{ + uint32_t eax, edx; +#if (((__GNUC__) > 4) || (((__GNUC__) == 4) && ((__GNUC_MINOR_) > 2))) + __asm__ volatile("xgetbv" + : "=a"(eax), "=d"(edx) + : "c"(0)); +#else + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" + : "=a"(eax), "=d"(edx) + : "c"(0)); +#endif + return (((uint64_t)edx << 32) | eax); +} + +#if defined(__APPLE__) +#define __cpuid(a, b, c, d, level) \ + __asm__ __volatile__( \ + "pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "0"(level)) + +#define __cpuid_count(a, b, c, d, level, count) \ + __asm__ __volatile__( \ + "pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "0"(level), "2"(count)) + +#else // Non-APPLE +#define __cpuid(a, b, c, d, level) \ + __asm__ __volatile__( \ + "cpuid\n" \ + : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ + : "0"(level)) + +#define __cpuid_count(a, b, c, d, level, count) \ + __asm__ __volatile__( \ + "cpuid\n" \ + : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ + : "0"(level), "2"(count)) +#endif +#endif + +static inline void get_cpu_feature(uint32_t level, uint32_t result[4]) +{ +#ifdef _MSC_VER + __cpuid(reinterpret_cast(result), level); +#else + __cpuid(result[0], result[1], result[2], result[3], level); +#endif +} + +static inline void get_cpu_feature_ext(uint32_t level, uint32_t count, uint32_t result[4]) +{ +#ifdef _MSC_VER + __cpuidex(reinterpret_cast(result), level, count); +#else + __cpuid_count(result[0], result[1], result[2], result[3], level, count); +#endif +} + +class CpuFeatures +{ +public: + static const uint64_t f_NONE = uint64_t(0); + static const uint64_t f_MMX = uint64_t(1) << 0; + static const uint64_t f_MMX2 = uint64_t(1) << 1; + static const uint64_t f_CMOV = uint64_t(1) << 2; + static const uint64_t f_SSE = uint64_t(1) << 3; + static const uint64_t f_SSE2 = uint64_t(1) << 4; + static const uint64_t f_SSE3 = uint64_t(1) << 5; + static const uint64_t f_SSSE3 = uint64_t(1) << 6; + static const uint64_t f_SSE41 = uint64_t(1) << 7; + static const uint64_t f_SSE42 = uint64_t(1) << 8; + static const uint64_t f_POPCNT = 
uint64_t(1) << 9; + static const uint64_t f_AESNI = uint64_t(1) << 10; + static const uint64_t f_SSE5 = uint64_t(1) << 11; + static const uint64_t f_OSXSAVE = uint64_t(1) << 12; + static const uint64_t f_PCLMULQDQ = uint64_t(1) << 13; + static const uint64_t f_AVX = uint64_t(1) << 14; + static const uint64_t f_FMA = uint64_t(1) << 15; + static const uint64_t f_SSE4a = uint64_t(1) << 16; + static const uint64_t f_RDTSCP = uint64_t(1) << 17; + static const uint64_t f_AVX2 = uint64_t(1) << 18; + static const uint64_t f_BMI1 = uint64_t(1) << 19; + static const uint64_t f_BMI2 = uint64_t(1) << 20; + static const uint64_t f_LZCNT = uint64_t(1) << 21; + static const uint64_t f_ENHANCED_REP = uint64_t(1) << 22; + static const uint64_t f_RDRAND = uint64_t(1) << 23; + static const uint64_t f_ADX = uint64_t(1) << 24; + static const uint64_t f_RDSEED = uint64_t(1) << 25; + static const uint64_t f_SMAP = uint64_t(1) << 26; + static const uint64_t f_HLE = uint64_t(1) << 27; + static const uint64_t f_RTM = uint64_t(1) << 28; + static const uint64_t f_F16C = uint64_t(1) << 29; + static const uint64_t f_MOVBE = uint64_t(1) << 30; + static const uint64_t f_AVX512F = uint64_t(1) << 31; + static const uint64_t f_AVX512DQ = uint64_t(1) << 32; + static const uint64_t f_AVX512IFMA = uint64_t(1) << 33; + static const uint64_t f_AVX512PF = uint64_t(1) << 34; + static const uint64_t f_AVX512ER = uint64_t(1) << 35; + static const uint64_t f_AVX512CD = uint64_t(1) << 36; + static const uint64_t f_AVX512BW = uint64_t(1) << 37; + static const uint64_t f_AVX512VL = uint64_t(1) << 38; + static const uint64_t f_AVX512VBMI = uint64_t(1) << 39; + static const uint64_t f_AVX512_4VNNIW = uint64_t(1) << 40; + static const uint64_t f_AVX512_4FMAPS = uint64_t(1) << 41; + static const uint64_t f_PREFETCHWT1 = uint64_t(1) << 42; + + static const uint32_t any = 0; + static const uint32_t sse42 = 1; + static const uint32_t avx = 2; + static const uint32_t avx2 = 3; + static const uint32_t avx512_comm = 4; + static const uint32_t avx512_core = 5; + static const uint32_t avx512_mic = 6; + static const uint32_t avx512_mic_4ops = 7; + + CpuFeatures() + { + features = f_NONE; + uint32_t result[4] = {0}; + + get_cpu_feature(0x80000001, result); + if (result[2] & (1U << 5)) features |= f_LZCNT; + if (result[3] & (1U << 27)) features |= f_RDTSCP; + + get_cpu_feature(1, result); + if (result[2] & (1U << 0)) features |= f_SSE3; + if (result[2] & (1U << 1)) features |= f_PCLMULQDQ; + if (result[2] & (1U << 9)) features |= f_SSSE3; + if (result[2] & (1U << 19)) features |= f_SSE41; + if (result[2] & (1U << 20)) features |= f_SSE42; + if (result[2] & (1U << 22)) features |= f_MOVBE; + if (result[2] & (1U << 23)) features |= f_POPCNT; + if (result[2] & (1U << 25)) features |= f_AESNI; + if (result[2] & (1U << 27)) features |= f_OSXSAVE; + if (result[2] & (1U << 30)) features |= f_RDRAND; + if (result[2] & (1U << 29)) features |= f_F16C; + if (result[3] & (1U << 15)) features |= f_CMOV; + if (result[3] & (1U << 23)) features |= f_MMX; + if (result[3] & (1U << 25)) features |= f_MMX2 | f_SSE; + if (result[3] & (1U << 26)) features |= f_SSE2; + + if (features & f_OSXSAVE) { + uint64_t x_enabled = __cpuidXfeature(); + if ((x_enabled & 0x6) == 0x6) { + if (result[2] & (1U << 28)) features |= f_AVX; + if (result[2] & (1U << 12)) features |= f_FMA; + if (((x_enabled >> 5) & 0x7) == 0x7) { + get_cpu_feature_ext(0x7, 0x0, result); + if (result[1] & (1U << 16)) { + features |= f_AVX512F; + if (result[1] & (1U << 17)) features |= f_AVX512DQ; + if (result[1] 
& (1U << 21)) features |= f_AVX512IFMA;
+                        if (result[1] & (1U << 26)) features |= f_AVX512PF;
+                        if (result[1] & (1U << 27)) features |= f_AVX512ER;
+                        if (result[1] & (1U << 28)) features |= f_AVX512CD;
+                        if (result[1] & (1U << 30)) features |= f_AVX512BW;
+                        if (result[1] & (1U << 31)) features |= f_AVX512VL;
+                        if (result[2] & (1U << 1)) features |= f_AVX512VBMI;
+                        if (result[3] & (1U << 2)) features |= f_AVX512_4VNNIW;
+                        if (result[3] & (1U << 3)) features |= f_AVX512_4FMAPS;
+                    }
+                }
+            }
+        }
+
+        get_cpu_feature(0x0, result);
+        if (result[0] >= 7) {
+            get_cpu_feature_ext(0x7, 0x0, result);
+            if ((features & f_AVX) && (result[1] & 0x20)) features |= f_AVX2;
+            if (result[1] & (1U << 3)) features |= f_BMI1;
+            if (result[1] & (1U << 8)) features |= f_BMI2;
+            if (result[1] & (1U << 9)) features |= f_ENHANCED_REP;
+            if (result[1] & (1U << 18)) features |= f_RDSEED;
+            if (result[1] & (1U << 19)) features |= f_ADX;
+            if (result[1] & (1U << 20)) features |= f_SMAP;
+            if (result[1] & (1U << 4)) features |= f_HLE;
+            if (result[1] & (1U << 11)) features |= f_RTM;
+            if (result[2] & (1U << 0)) features |= f_PREFETCHWT1;
+        }
+    }
+
+    bool has_feature(uint64_t f)
+    {
+        return (features & f) ? true : false;
+    }
+
+    bool is_supported(const uint32_t cpu_isa)
+    {
+        switch (cpu_isa) {
+        case sse42:
+            return has_feature(f_SSE42);
+        case avx:
+            return has_feature(f_AVX);
+        case avx2:
+            return has_feature(f_AVX2);
+        case avx512_comm:
+            return has_feature(f_AVX512F);
+        case avx512_core:
+            return has_feature(f_AVX512F)
+                && has_feature(f_AVX512BW)
+                && has_feature(f_AVX512VL)
+                && has_feature(f_AVX512DQ);
+        case avx512_mic:
+            return has_feature(f_AVX512F)
+                && has_feature(f_AVX512CD)
+                && has_feature(f_AVX512ER)
+                && has_feature(f_AVX512PF);
+        case avx512_mic_4ops:
+            return is_supported(avx512_mic)
+                && has_feature(f_AVX512_4FMAPS)
+                && has_feature(f_AVX512_4VNNIW);
+        case any:
+            return true;
+        default:
+            return false;
+        }
+    }
+
+private:
+    uint64_t features;
+};
+
+memory::format get_desired_format(int channel)
+{
+    CpuFeatures cpu_f;
+    memory::format fmt_desired = memory::format::any;
+
+    if (cpu_f.is_supported(CpuFeatures::avx512_comm) && (channel % 16) == 0) {
+        fmt_desired = memory::format::nChw16c;
+    } else if (cpu_f.is_supported(CpuFeatures::avx2) && (channel % 8) == 0) {
+        fmt_desired = memory::format::nChw8c;
+    } else {
+        fmt_desired = memory::format::nchw;
+    }
+    return fmt_desired;
+}
+
+memory::format get_desired_format_weight(int channel0, int channel1)
+{
+    CpuFeatures cpu_f;
+    memory::format fmt_desired = memory::format::any;
+
+    if (cpu_f.is_supported(CpuFeatures::avx512_comm) && (channel0 % 16) == 0) {
+        if (channel1 % 16 == 0)
+            fmt_desired = memory::format::OIhw16i16o;
+        else
+            fmt_desired = memory::format::Oihw16o;
+    } else if (cpu_f.is_supported(CpuFeatures::avx2) && (channel0 % 8) == 0) {
+        if (channel1 % 8 == 0)
+            fmt_desired = memory::format::OIhw8i8o;
+        else
+            fmt_desired = memory::format::Ohwi8o;
+    } else {
+        fmt_desired = memory::format::nchw;
+    }
+    return fmt_desired;
+}
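+
+// Illustrative sketch (not part of the original sources): the helpers above
+// pick an mkldnn blocked layout from the ISA detected via CPUID. For a
+// 64-channel activation tensor one would expect nChw16c on AVX-512 machines,
+// nChw8c on AVX2 machines, and plain nchw otherwise:
+//
+//   mkldnn::memory::format fmt  = get_desired_format(64);
+//   mkldnn::memory::format wfmt = get_desired_format_weight(64, 32);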
diff --git a/python/ideep4py/common/utils.h b/python/ideep4py/common/utils.h
new file mode 100644
index 00000000..4856520c
--- /dev/null
+++ b/python/ideep4py/common/utils.h
@@ -0,0 +1,202 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _UTILS_H_
+#define _UTILS_H_
+
+#include <mkldnn.hpp>
+#include <sstream>
+#include <glog/logging.h>
+#include "op_param.h"
+#include "omp.h"
+using namespace mkldnn;
+
+#define GET_PTR(t, p, offset) reinterpret_cast<t *>(reinterpret_cast<char *>(p) + static_cast<size_t>(offset))
+
+memory::format get_desired_format(int channel);
+memory::format get_desired_format_weight(int channel0, int channel1);
+
+template <typename T>
+void eltwise_multiply(T* x1, T* x2, T* y, size_t n) {
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < n; ++i) {
+        y[i] = x1[i] * x2[i];
+    }
+}
+
+//
+//// map C type with mkldnn's
+//// float   -> memory::data_type::f32
+//// int     -> memory::data_type::s32
+//// int16_t -> memory::data_type::s16
+//// int8_t  -> memory::data_type::s8
+//// uint8_t -> memory::data_type::u8
+//
+template <typename T>
+static inline mkldnn::memory::data_type memory_data_type() {
+    if (typeid(T) == typeid(float))
+        return mkldnn::memory::data_type::f32;
+    else if (typeid(T) == typeid(int))
+        return mkldnn::memory::data_type::s32;
+    else if (typeid(T) == typeid(int16_t))
+        return mkldnn::memory::data_type::s16;
+    else if (typeid(T) == typeid(int8_t))
+        return mkldnn::memory::data_type::s8;
+    else if (typeid(T) == typeid(uint8_t))
+        return mkldnn::memory::data_type::u8;
+
+    LOG(ERROR) << "Unsupported type";
+    return mkldnn::memory::data_type::data_undef;
+}
+
+// utility functions converting int/double/float/bool/dims values to strings
+static inline std::string int_to_string(int value) {
+    std::ostringstream os;
+    os << std::hex << "I" << value << "_";
+    return os.str();
+}
+
+static inline std::string double_to_string(double value) {
+    std::ostringstream os;
+    os << "D" << value << "_";
+    return os.str();
+}
+
+static inline std::string float_to_string(float value) {
+    std::ostringstream os;
+    os << "F" << value << "_";
+    return os.str();
+}
+
+static inline std::string bool_to_string(bool value) {
+    std::ostringstream os;
+    os << "B" << value << "_";
+    return os.str();
+}
+
+static inline std::string dims_to_string(mkldnn::memory::dims dims) {
+    std::ostringstream os;
+    os << "DIMS:";
+    for (unsigned int i = 0; i < dims.size(); i++)
+        os << dims[i] << ",";
+    os << ";";
+    return os.str();
+}
+
+static inline std::string long_to_string(size_t value) {
+    std::ostringstream os;
+    os << std::hex << "L" << value << "_";
+    return os.str();
+}
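+
+// Illustrative sketch (not part of the original header): memory_data_type()
+// maps a C++ element type to the matching mkldnn enum, and GET_PTR computes a
+// typed pointer at a byte offset into a raw buffer, e.g.
+//
+//   auto dt = memory_data_type<float>();         // mkldnn f32
+//   float *chunk = GET_PTR(float, buffer, 64);   // buffer + 64 bytes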
+static inline mkldnn::algorithm pooling_algo_convert(pooling_param_t::algorithm input) {
+    switch(input) {
+    case pooling_param_t::algorithm::pooling_max:
+        return mkldnn::pooling_max;
+    case pooling_param_t::algorithm::pooling_avg:
+        return mkldnn::pooling_avg;
+    case pooling_param_t::algorithm::pooling_avg_include_padding:
+        return mkldnn::pooling_avg_include_padding;
+    case pooling_param_t::algorithm::pooling_avg_exclude_padding:
+        return mkldnn::pooling_avg_exclude_padding;
+    default:
+        LOG(ERROR) << "Not a valid pooling algo";
+        return mkldnn::pooling_max;
+    }
+}
+
+static inline mkldnn::algorithm lrn_algo_convert(lrn_param_t::algorithm input) {
+    switch(input) {
+    case lrn_param_t::algorithm::lrn_across_channels:
+        return mkldnn::lrn_across_channels;
+    case lrn_param_t::algorithm::lrn_within_channel:
+        return mkldnn::lrn_within_channel;
+    default:
+        LOG(ERROR) << "Not a valid lrn algo";
+        return mkldnn::lrn_across_channels;
+    }
+}
+
+template <typename T, typename U>
+inline T div_up(const T a, const U b) {
+    assert(b);
+    return (a + b - 1) / b;
+}
+
+// Split n work items across a team of threads as evenly as possible.
+template <typename T, typename U>
+inline void balance211(T n, U team, U tid, T &n_start, T &n_end) {
+    T n_min = 1;
+    T &n_my = n_end;
+    if (team <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else if (n_min == 1) {
+        // team = T1 + T2
+        // n = T1*n1 + T2*n2  (n1 - n2 = 1)
+        T n1 = div_up(n, (T)team);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * (T)team;
+        n_my = (T)tid < T1 ? n1 : n2;
+        n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
+    }
+
+    n_end += n_start;
+}
+
+// Copy the bulk of the buffer as floats in parallel; the last thread picks up
+// the float remainder and any trailing bytes.
+inline void fast_memcpy(char* data_o, char *data_i, size_t len)
+{
+    size_t nelems_float = len / 4;
+    size_t nelems_char = len % 4;
+    const int block_size = 16;
+    const auto num_blocks_float = nelems_float / block_size;
+    const auto rem_elems_float = nelems_float % block_size;
+    float* output_f = (float*)data_o;
+    float* input_f = (float*) data_i;
+    char* output_c = (char*) data_o;
+    char* input_c = (char*) data_i;
+#   pragma omp parallel
+    {
+        const int ithr = omp_get_thread_num();
+        const int nthr = omp_get_num_threads();
+        size_t start{0}, end{0};
+        balance211(num_blocks_float, nthr, ithr, start, end);
+        start = start * block_size;
+        end = end * block_size;
+#       pragma omp simd
+        for (size_t e = start; e < end; ++e) {
+            output_f[e] = input_f[e];
+        }
+        if (rem_elems_float != 0 && ithr == nthr - 1) {
+            for (auto e = nelems_float - rem_elems_float; e < nelems_float; ++e) {
+                output_f[e] = input_f[e];
+            }
+        }
+        if (nelems_char != 0 && ithr == nthr - 1) {
+            for (auto e = nelems_float * 4; e < len; ++e) {
+                output_c[e] = input_c[e];
+            }
+        }
+    }
+    return;
+}
+
+#endif // _UTILS_H_
diff --git a/python/ideep4py/cosim/__init__.py b/python/ideep4py/cosim/__init__.py
new file mode 100644
index 00000000..0e3ed7c2
--- /dev/null
+++ b/python/ideep4py/cosim/__init__.py
@@ -0,0 +1 @@
+from ideep4py.cosim.cosim import cosim_verify, is_cosim  # NOQA
diff --git a/python/ideep4py/cosim/cosim.py b/python/ideep4py/cosim/cosim.py
new file mode 100644
index 00000000..681e2c2b
--- /dev/null
+++ b/python/ideep4py/cosim/cosim.py
@@ -0,0 +1,156 @@
+import logging
+import numpy as np
+import os
+
+from ideep4py import mdarray
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s [%(levelname)s]: %(message)s')
+global_config_cosim = bool(int(os.environ.get('CHAINER_ENABLE_COSIM', '0')))
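+
+# Illustrative note (not part of the original module): cosim checking is
+# opt-in via the environment variable read above, so a typical run would be
+# launched as, e.g.
+#
+#   CHAINER_ENABLE_COSIM=1 python train_mnist.py
+#
+# Leaving it unset (or setting it to 0) keeps cosim checking disabled.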
+ """ + return global_config_cosim + + +# Convert chainer.variable to array out of plain_array +def plain_array(params): + assert isinstance(params, tuple) \ + or isinstance(params, list) \ + or isinstance(params, mdarray) \ + or isinstance(params, np.ndarray) + # plain_array does not support chainer.variable + # or isinstance(params, chainer.variable.Variable) + + _params = () + + # plain_array does not support chainer.variable + # if isinstance(params, variable.Variable): + # return np.array(params.data), + if isinstance(params, np.ndarray): + return params, + elif isinstance(params, mdarray): + return np.array(params), + + for p in params: + # plain_array does not support chainer.variable + # if isinstance(p, variable.Variable): + # p = np.array(p.data) + if isinstance(p, mdarray): + _params += (np.array(p),) + else: + _params += (p,) + + return _params + + +def force_array(x, dtype=None): + # numpy returns a float value (scalar) when a return value of an operator + # is a 0-dimension array. + # We need to convert such a value to a 0-dimension array because `Function` + # object needs to return an `numpy.ndarray`. + if np.isscalar(x): + if dtype is None: + return np.array(x) + else: + return np.array(x, dtype) + else: + if dtype is None: + return x + else: + return x.astype(dtype, copy=False) + + +def expect_allclose(act, ref, atol=1e-4, rtol=1e-4, verbose=True): + """Failed if some corresponding element of act and ref differs too much. + + Args: + act: Left-hand-side array. + ref: Right-hand-side array. + atol (float): Absolute tolerance. + rtol (float): Relative tolerance. + verbose (bool): If ``True``, it outputs verbose messages on error. + """ + if not isinstance(act, np.ndarray) or not isinstance(ref, np.ndarray): + logging.warning('wrong array types') + return False + + act = force_array(act) + ref = force_array(ref) + + if (act.size != ref.size or act.itemsize != ref.itemsize + or act.shape != ref.shape): + logging.warning('size is not matched!\n \ + size: act={0} ref={1} \ + itemsize: act={2} ref={3}\n' + 'shape: act={4}, ref={5} dtype: act={6} ref={7}' + .format(act.size, ref.size, act.itemsize, ref.itemsize, + act.shape, ref.shape, act.dtype, ref.dtype)) + return False + + act = np.ascontiguousarray(act) + ref = np.ascontiguousarray(ref) + + try: + np.testing.assert_allclose(act, ref, rtol, atol, verbose=verbose) + except Exception: + return False + + return True + + +def verify_results(func, acts, refs, inputs): + if acts is None and refs is None: + logging.warning('input results are None!') + return True + elif acts is None or refs is None: + logging.error('cosim: input results are None!') + return False + + if len(acts) != len(refs): + logging.error('cosim: lengths of results \ + are different !' 
+
+    check_options = {'atol': 1e-3, 'rtol': 1e-2, 'verbose': True}
+
+    for (i, (act, ref)) in enumerate(zip(acts, refs)):
+        if ref is None and act is None:
+            continue
+        elif ref is None or act is None:
+            logging.error('cosim: one input result is None!')
+            return False
+
+        if not expect_allclose(*plain_array((act, ref)), **check_options):
+            logging.error('cosim: mismatch in {0} #{1} result!\n'
+                          'size: {2}, itemsize: {3}\n'
+                          'shape: {4}, dtype: {5}'
+                          .format(func.__class__.__name__, i, act.size,
                                   act.itemsize, act.shape, act.dtype))
+            return False
+
+    return True
+
+
+def cosim_verify(func, acts, inputs):
+    if not is_cosim():
+        return
+
+    logging.info('cosim test for function {0} ...'.format(
+        func.__class__.__name__))
+
+    refs = plain_array(func.forward_cpu(plain_array(inputs)))
+
+    if not verify_results(func, acts, refs, inputs):
+        logging.error('cosim test for function {0} ...FAILED'.format(
+            func.__class__.__name__))
+        raise RuntimeError
+
+    logging.info('cosim test for function {0} ...PASS'.format(
+        func.__class__.__name__))
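+
+# Illustrative usage sketch (hypothetical `func` object, not part of this
+# module): a function that provides both an accelerated forward() and a NumPy
+# reference forward_cpu() can be cross-checked like this:
+#
+#   y = func.forward(inputs)        # accelerated (iDeep) results
+#   cosim_verify(func, y, inputs)   # re-runs forward_cpu and compares;
+#                                   # raises RuntimeError on mismatch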
diff --git a/python/ideep4py/include/blas/blas.h b/python/ideep4py/include/blas/blas.h
new file mode 100644
index 00000000..3b0e334d
--- /dev/null
+++ b/python/ideep4py/include/blas/blas.h
@@ -0,0 +1,30 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _BLAS_H_
+#define _BLAS_H_
+
+#include "sum.h"
+
+#endif
diff --git a/python/ideep4py/include/blas/sum.h b/python/ideep4py/include/blas/sum.h
new file mode 100644
index 00000000..57ed0a19
--- /dev/null
+++ b/python/ideep4py/include/blas/sum.h
@@ -0,0 +1,33 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _SUM_ALONG_AXIS_H_
+#define _SUM_ALONG_AXIS_H_
+
+#include <vector>
+#include "tensor.h"
+
+Tensor * blas_sum(Tensor *src, std::vector<int> axis);
+
+#endif
diff --git a/python/ideep4py/include/mkl/mkl_types.h b/python/ideep4py/include/mkl/mkl_types.h
new file mode 100644
index 00000000..bfa38e24
--- /dev/null
+++ b/python/ideep4py/include/mkl/mkl_types.h
@@ -0,0 +1,149 @@
+/*******************************************************************************
+* Copyright (c) 1999-2017, Intel Corporation
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+*     * Redistributions of source code must retain the above copyright notice,
+*       this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of Intel Corporation nor the names of its contributors
+*       may be used to endorse or promote products derived from this software
+*       without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*
+!  Content:
+!
Intel(R) Math Kernel Library (Intel(R) MKL) types definition +!****************************************************************************/ + +#ifndef _MKL_TYPES_H_ +#define _MKL_TYPES_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* MKL Complex type for single precision */ +#ifndef MKL_Complex8 +typedef +struct _MKL_Complex8 { + float real; + float imag; +} MKL_Complex8; +#endif + +/* MKL Complex type for double precision */ +#ifndef MKL_Complex16 +typedef +struct _MKL_Complex16 { + double real; + double imag; +} MKL_Complex16; +#endif + +/* MKL Version type */ +typedef +struct { + int MajorVersion; + int MinorVersion; + int UpdateVersion; + char * ProductStatus; + char * Build; + char * Processor; + char * Platform; +} MKLVersion; + +/* MKL integer types for LP64 and ILP64 */ +#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) + #define MKL_INT64 __int64 + #define MKL_UINT64 unsigned __int64 +#else + #define MKL_INT64 long long int + #define MKL_UINT64 unsigned long long int +#endif + +#ifdef MKL_ILP64 + +/* MKL ILP64 integer types */ +#ifndef MKL_INT + #define MKL_INT MKL_INT64 +#endif +#ifndef MKL_UINT + #define MKL_UINT MKL_UINT64 +#endif +#define MKL_LONG MKL_INT64 + +#else + +/* MKL LP64 integer types */ +#ifndef MKL_INT + #define MKL_INT int +#endif +#ifndef MKL_UINT + #define MKL_UINT unsigned int +#endif +#define MKL_LONG long int + +#endif + +/* MKL threading stuff. MKL Domain names */ +#define MKL_DOMAIN_ALL 0 +#define MKL_DOMAIN_BLAS 1 +#define MKL_DOMAIN_FFT 2 +#define MKL_DOMAIN_VML 3 +#define MKL_DOMAIN_PARDISO 4 + +/* MKL CBWR stuff */ + +/* options */ +#define MKL_CBWR_BRANCH 1 +#define MKL_CBWR_ALL ~0 + +/* common settings */ +#define MKL_CBWR_UNSET_ALL 0 +#define MKL_CBWR_OFF 0 + +/* branch specific values */ +#define MKL_CBWR_BRANCH_OFF 1 +#define MKL_CBWR_AUTO 2 +#define MKL_CBWR_COMPATIBLE 3 +#define MKL_CBWR_SSE2 4 +#define MKL_CBWR_SSSE3 6 +#define MKL_CBWR_SSE4_1 7 +#define MKL_CBWR_SSE4_2 8 +#define MKL_CBWR_AVX 9 +#define MKL_CBWR_AVX2 10 +#define MKL_CBWR_AVX512_MIC 11 +#define MKL_CBWR_AVX512 12 + +/* error codes */ +#define MKL_CBWR_SUCCESS 0 +#define MKL_CBWR_ERR_INVALID_SETTINGS -1 +#define MKL_CBWR_ERR_INVALID_INPUT -2 +#define MKL_CBWR_ERR_UNSUPPORTED_BRANCH -3 +#define MKL_CBWR_ERR_UNKNOWN_BRANCH -4 +#define MKL_CBWR_ERR_MODE_CHANGE_FAILURE -8 + +/* Obsolete */ +#define MKL_CBWR_SSE3 5 + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* _MKL_TYPES_H_ */ diff --git a/python/ideep4py/include/mkl/mkl_vsl.h b/python/ideep4py/include/mkl/mkl_vsl.h new file mode 100644 index 00000000..a6ad20d3 --- /dev/null +++ b/python/ideep4py/include/mkl/mkl_vsl.h @@ -0,0 +1,51 @@ +/* file: mkl_vsl.h */ +/******************************************************************************* +* Copyright (c) 2006-2017, Intel Corporation +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* * Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of Intel Corporation nor the names of its contributors +* may be used to endorse or promote products derived from this software +* without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* +//++ +// The main VSL header file. +//-- +*/ + +#ifndef __MKL_VSL_H__ +#define __MKL_VSL_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include "mkl_vsl_defines.h" +#include "mkl_vsl_functions.h" +#include "mkl_vsl_types.h" + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __MKL_VSL_H__ */ diff --git a/python/ideep4py/include/mkl/mkl_vsl_defines.h b/python/ideep4py/include/mkl/mkl_vsl_defines.h new file mode 100644 index 00000000..849044ac --- /dev/null +++ b/python/ideep4py/include/mkl/mkl_vsl_defines.h @@ -0,0 +1,1094 @@ +/* file: mkl_vsl_defines.h */ +/******************************************************************************* +* Copyright (c) 2006-2017, Intel Corporation +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* * Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of Intel Corporation nor the names of its contributors +* may be used to endorse or promote products derived from this software +* without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+/*
+//++
+// User-level macro definitions
+//--
+*/
+
+#ifndef __MKL_VSL_DEFINES_H__
+#define __MKL_VSL_DEFINES_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*
+// "No error" status
+*/
+#define VSL_STATUS_OK 0
+#define VSL_ERROR_OK 0
+
+/*
+// Common errors (-1..-999)
+*/
+#define VSL_ERROR_FEATURE_NOT_IMPLEMENTED -1
+#define VSL_ERROR_UNKNOWN -2
+#define VSL_ERROR_BADARGS -3
+#define VSL_ERROR_MEM_FAILURE -4
+#define VSL_ERROR_NULL_PTR -5
+#define VSL_ERROR_CPU_NOT_SUPPORTED -6
+
+
+/*
+// RNG errors (-1000..-1999)
+*/
+/* brng errors */
+#define VSL_RNG_ERROR_INVALID_BRNG_INDEX -1000
+#define VSL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002
+#define VSL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003
+#define VSL_RNG_ERROR_BRNGS_INCOMPATIBLE -1005
+#define VSL_RNG_ERROR_BAD_STREAM -1006
+#define VSL_RNG_ERROR_BRNG_TABLE_FULL -1007
+#define VSL_RNG_ERROR_BAD_STREAM_STATE_SIZE -1008
+#define VSL_RNG_ERROR_BAD_WORD_SIZE -1009
+#define VSL_RNG_ERROR_BAD_NSEEDS -1010
+#define VSL_RNG_ERROR_BAD_NBITS -1011
+#define VSL_RNG_ERROR_QRNG_PERIOD_ELAPSED -1012
+#define VSL_RNG_ERROR_LEAPFROG_NSTREAMS_TOO_BIG -1013
+#define VSL_RNG_ERROR_BRNG_NOT_SUPPORTED -1014
+
+/* abstract stream related errors */
+#define VSL_RNG_ERROR_BAD_UPDATE -1120
+#define VSL_RNG_ERROR_NO_NUMBERS -1121
+#define VSL_RNG_ERROR_INVALID_ABSTRACT_STREAM -1122
+
+/* non-deterministic stream related errors */
+#define VSL_RNG_ERROR_NONDETERM_NOT_SUPPORTED -1130
+#define VSL_RNG_ERROR_NONDETERM_NRETRIES_EXCEEDED -1131
+
+/* ARS5 stream related errors */
+#define VSL_RNG_ERROR_ARS5_NOT_SUPPORTED -1140
+
+/* read/write stream to file errors */
+#define VSL_RNG_ERROR_FILE_CLOSE -1100
+#define VSL_RNG_ERROR_FILE_OPEN -1101
+#define VSL_RNG_ERROR_FILE_WRITE -1102
+#define VSL_RNG_ERROR_FILE_READ -1103
+
+#define VSL_RNG_ERROR_BAD_FILE_FORMAT -1110
+#define VSL_RNG_ERROR_UNSUPPORTED_FILE_VER -1111
+
+#define VSL_RNG_ERROR_BAD_MEM_FORMAT -1200
+
+/* Convolution/correlation errors */
+#define VSL_CC_ERROR_NOT_IMPLEMENTED (-2000)
+#define VSL_CC_ERROR_ALLOCATION_FAILURE (-2001)
+#define VSL_CC_ERROR_BAD_DESCRIPTOR (-2200)
+#define VSL_CC_ERROR_SERVICE_FAILURE (-2210)
+#define VSL_CC_ERROR_EDIT_FAILURE (-2211)
+#define VSL_CC_ERROR_EDIT_PROHIBITED (-2212)
+#define VSL_CC_ERROR_COMMIT_FAILURE (-2220)
+#define VSL_CC_ERROR_COPY_FAILURE (-2230)
+#define VSL_CC_ERROR_DELETE_FAILURE (-2240)
+#define VSL_CC_ERROR_BAD_ARGUMENT (-2300)
+#define VSL_CC_ERROR_DIMS (-2301)
+#define VSL_CC_ERROR_START (-2302)
+#define VSL_CC_ERROR_DECIMATION (-2303)
+#define VSL_CC_ERROR_XSHAPE (-2311)
+#define VSL_CC_ERROR_YSHAPE (-2312)
+#define VSL_CC_ERROR_ZSHAPE (-2313)
+#define VSL_CC_ERROR_XSTRIDE (-2321)
+#define VSL_CC_ERROR_YSTRIDE (-2322)
+#define VSL_CC_ERROR_ZSTRIDE (-2323)
+#define VSL_CC_ERROR_X (-2331)
+#define VSL_CC_ERROR_Y (-2332)
+#define VSL_CC_ERROR_Z (-2333)
+#define VSL_CC_ERROR_JOB (-2100)
+#define VSL_CC_ERROR_KIND (-2110)
+#define VSL_CC_ERROR_MODE (-2120)
+#define VSL_CC_ERROR_TYPE (-2130)
+#define VSL_CC_ERROR_PRECISION (-2400)
+#define VSL_CC_ERROR_EXTERNAL_PRECISION (-2141)
+#define VSL_CC_ERROR_INTERNAL_PRECISION (-2142)
+#define VSL_CC_ERROR_METHOD (-2400)
+#define VSL_CC_ERROR_OTHER (-2800)
+
+/*
+//++
+// SUMMARY STATISTICS ERROR/WARNING CODES
+//--
+*/
+
+/*
+// Warnings
+*/
+#define VSL_SS_NOT_FULL_RANK_MATRIX 4028
+#define VSL_SS_SEMIDEFINITE_COR 4029
+/*
+// Errors (-4000..-4999)
+*/
+#define VSL_SS_ERROR_ALLOCATION_FAILURE -4000
+#define VSL_SS_ERROR_BAD_DIMEN -4001 +#define VSL_SS_ERROR_BAD_OBSERV_N -4002 +#define VSL_SS_ERROR_STORAGE_NOT_SUPPORTED -4003 +#define VSL_SS_ERROR_BAD_INDC_ADDR -4004 +#define VSL_SS_ERROR_BAD_WEIGHTS -4005 +#define VSL_SS_ERROR_BAD_MEAN_ADDR -4006 +#define VSL_SS_ERROR_BAD_2R_MOM_ADDR -4007 +#define VSL_SS_ERROR_BAD_3R_MOM_ADDR -4008 +#define VSL_SS_ERROR_BAD_4R_MOM_ADDR -4009 +#define VSL_SS_ERROR_BAD_2C_MOM_ADDR -4010 +#define VSL_SS_ERROR_BAD_3C_MOM_ADDR -4011 +#define VSL_SS_ERROR_BAD_4C_MOM_ADDR -4012 +#define VSL_SS_ERROR_BAD_KURTOSIS_ADDR -4013 +#define VSL_SS_ERROR_BAD_SKEWNESS_ADDR -4014 +#define VSL_SS_ERROR_BAD_MIN_ADDR -4015 +#define VSL_SS_ERROR_BAD_MAX_ADDR -4016 +#define VSL_SS_ERROR_BAD_VARIATION_ADDR -4017 +#define VSL_SS_ERROR_BAD_COV_ADDR -4018 +#define VSL_SS_ERROR_BAD_COR_ADDR -4019 +#define VSL_SS_ERROR_BAD_ACCUM_WEIGHT_ADDR -4020 +#define VSL_SS_ERROR_BAD_QUANT_ORDER_ADDR -4021 +#define VSL_SS_ERROR_BAD_QUANT_ORDER -4022 +#define VSL_SS_ERROR_BAD_QUANT_ADDR -4023 +#define VSL_SS_ERROR_BAD_ORDER_STATS_ADDR -4024 +#define VSL_SS_ERROR_MOMORDER_NOT_SUPPORTED -4025 +#define VSL_SS_ERROR_ALL_OBSERVS_OUTLIERS -4026 +#define VSL_SS_ERROR_BAD_ROBUST_COV_ADDR -4027 +#define VSL_SS_ERROR_BAD_ROBUST_MEAN_ADDR -4028 +#define VSL_SS_ERROR_METHOD_NOT_SUPPORTED -4029 +#define VSL_SS_ERROR_BAD_GROUP_INDC_ADDR -4030 +#define VSL_SS_ERROR_NULL_TASK_DESCRIPTOR -4031 +#define VSL_SS_ERROR_BAD_OBSERV_ADDR -4032 +#define VSL_SS_ERROR_SINGULAR_COV -4033 +#define VSL_SS_ERROR_BAD_POOLED_COV_ADDR -4034 +#define VSL_SS_ERROR_BAD_POOLED_MEAN_ADDR -4035 +#define VSL_SS_ERROR_BAD_GROUP_COV_ADDR -4036 +#define VSL_SS_ERROR_BAD_GROUP_MEAN_ADDR -4037 +#define VSL_SS_ERROR_BAD_GROUP_INDC -4038 +#define VSL_SS_ERROR_BAD_OUTLIERS_PARAMS_ADDR -4039 +#define VSL_SS_ERROR_BAD_OUTLIERS_PARAMS_N_ADDR -4040 +#define VSL_SS_ERROR_BAD_OUTLIERS_WEIGHTS_ADDR -4041 +#define VSL_SS_ERROR_BAD_ROBUST_COV_PARAMS_ADDR -4042 +#define VSL_SS_ERROR_BAD_ROBUST_COV_PARAMS_N_ADDR -4043 +#define VSL_SS_ERROR_BAD_STORAGE_ADDR -4044 +#define VSL_SS_ERROR_BAD_PARTIAL_COV_IDX_ADDR -4045 +#define VSL_SS_ERROR_BAD_PARTIAL_COV_ADDR -4046 +#define VSL_SS_ERROR_BAD_PARTIAL_COR_ADDR -4047 +#define VSL_SS_ERROR_BAD_MI_PARAMS_ADDR -4048 +#define VSL_SS_ERROR_BAD_MI_PARAMS_N_ADDR -4049 +#define VSL_SS_ERROR_BAD_MI_BAD_PARAMS_N -4050 +#define VSL_SS_ERROR_BAD_MI_PARAMS -4051 +#define VSL_SS_ERROR_BAD_MI_INIT_ESTIMATES_N_ADDR -4052 +#define VSL_SS_ERROR_BAD_MI_INIT_ESTIMATES_ADDR -4053 +#define VSL_SS_ERROR_BAD_MI_SIMUL_VALS_ADDR -4054 +#define VSL_SS_ERROR_BAD_MI_SIMUL_VALS_N_ADDR -4055 +#define VSL_SS_ERROR_BAD_MI_ESTIMATES_N_ADDR -4056 +#define VSL_SS_ERROR_BAD_MI_ESTIMATES_ADDR -4057 +#define VSL_SS_ERROR_BAD_MI_SIMUL_VALS_N -4058 +#define VSL_SS_ERROR_BAD_MI_ESTIMATES_N -4059 +#define VSL_SS_ERROR_BAD_MI_OUTPUT_PARAMS -4060 +#define VSL_SS_ERROR_BAD_MI_PRIOR_N_ADDR -4061 +#define VSL_SS_ERROR_BAD_MI_PRIOR_ADDR -4062 +#define VSL_SS_ERROR_BAD_MI_MISSING_VALS_N -4063 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_PARAMS_N_ADDR -4064 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_PARAMS_ADDR -4065 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_PARAMS_N -4066 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_PARAMS -4067 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_ORDER_ADDR -4068 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_ORDER -4069 +#define VSL_SS_ERROR_BAD_STREAM_QUANT_ADDR -4070 +#define VSL_SS_ERROR_BAD_PARAMTR_COR_ADDR -4071 +#define VSL_SS_ERROR_BAD_COR -4072 +#define VSL_SS_ERROR_BAD_PARTIAL_COV_IDX -4073 +#define VSL_SS_ERROR_BAD_SUM_ADDR -4074 +#define 
VSL_SS_ERROR_BAD_2R_SUM_ADDR -4075
+#define VSL_SS_ERROR_BAD_3R_SUM_ADDR -4076
+#define VSL_SS_ERROR_BAD_4R_SUM_ADDR -4077
+#define VSL_SS_ERROR_BAD_2C_SUM_ADDR -4078
+#define VSL_SS_ERROR_BAD_3C_SUM_ADDR -4079
+#define VSL_SS_ERROR_BAD_4C_SUM_ADDR -4080
+#define VSL_SS_ERROR_BAD_CP_ADDR -4081
+#define VSL_SS_ERROR_BAD_MDAD_ADDR -4082
+#define VSL_SS_ERROR_BAD_MNAD_ADDR -4083
+#define VSL_SS_ERROR_BAD_SORTED_OBSERV_ADDR -4084
+#define VSL_SS_ERROR_INDICES_NOT_SUPPORTED -4085
+
+
+/*
+// Internal errors caused by internal routines of the functions
+*/
+#define VSL_SS_ERROR_ROBCOV_INTERN_C1 -5000
+#define VSL_SS_ERROR_PARTIALCOV_INTERN_C1 -5010
+#define VSL_SS_ERROR_PARTIALCOV_INTERN_C2 -5011
+#define VSL_SS_ERROR_MISSINGVALS_INTERN_C1 -5021
+#define VSL_SS_ERROR_MISSINGVALS_INTERN_C2 -5022
+#define VSL_SS_ERROR_MISSINGVALS_INTERN_C3 -5023
+#define VSL_SS_ERROR_MISSINGVALS_INTERN_C4 -5024
+#define VSL_SS_ERROR_MISSINGVALS_INTERN_C5 -5025
+#define VSL_SS_ERROR_PARAMTRCOR_INTERN_C1 -5030
+#define VSL_SS_ERROR_COVRANK_INTERNAL_ERROR_C1 -5040
+#define VSL_SS_ERROR_INVCOV_INTERNAL_ERROR_C1 -5041
+#define VSL_SS_ERROR_INVCOV_INTERNAL_ERROR_C2 -5042
+
+
+/*
+// CONV/CORR RELATED MACRO DEFINITIONS
+*/
+#define VSL_CONV_MODE_AUTO 0
+#define VSL_CORR_MODE_AUTO 0
+#define VSL_CONV_MODE_DIRECT 1
+#define VSL_CORR_MODE_DIRECT 1
+#define VSL_CONV_MODE_FFT 2
+#define VSL_CORR_MODE_FFT 2
+#define VSL_CONV_PRECISION_SINGLE 1
+#define VSL_CORR_PRECISION_SINGLE 1
+#define VSL_CONV_PRECISION_DOUBLE 2
+#define VSL_CORR_PRECISION_DOUBLE 2
+
+/*
+//++
+// BASIC RANDOM NUMBER GENERATOR (BRNG) RELATED MACRO DEFINITIONS
+//--
+*/
+
+/*
+// MAX NUMBER OF BRNGS CAN BE REGISTERED IN VSL
+// No more than VSL_MAX_REG_BRNGS basic generators can be registered in VSL
+// (including predefined basic generators).
+//
+// Change this number to increase/decrease number of BRNGs can be registered.
+*/
+#define VSL_MAX_REG_BRNGS 512
+
+/*
+// PREDEFINED BRNG NAMES
+*/
+#define VSL_BRNG_SHIFT 20
+#define VSL_BRNG_INC (1<<VSL_BRNG_SHIFT)
+
+#define VSL_BRNG_MCG31 (VSL_BRNG_INC)
+#define VSL_BRNG_R250 (VSL_BRNG_MCG31 +VSL_BRNG_INC)
+#define VSL_BRNG_MRG32K3A (VSL_BRNG_R250 +VSL_BRNG_INC)
+#define VSL_BRNG_MCG59 (VSL_BRNG_MRG32K3A +VSL_BRNG_INC)
+#define VSL_BRNG_WH (VSL_BRNG_MCG59 +VSL_BRNG_INC)
+#define VSL_BRNG_SOBOL (VSL_BRNG_WH +VSL_BRNG_INC)
+#define VSL_BRNG_NIEDERR (VSL_BRNG_SOBOL +VSL_BRNG_INC)
+#define VSL_BRNG_MT19937 (VSL_BRNG_NIEDERR +VSL_BRNG_INC)
+#define VSL_BRNG_MT2203 (VSL_BRNG_MT19937 +VSL_BRNG_INC)
+#define VSL_BRNG_IABSTRACT (VSL_BRNG_MT2203 +VSL_BRNG_INC)
+#define VSL_BRNG_DABSTRACT (VSL_BRNG_IABSTRACT+VSL_BRNG_INC)
+#define VSL_BRNG_SABSTRACT (VSL_BRNG_DABSTRACT+VSL_BRNG_INC)
+#define VSL_BRNG_SFMT19937 (VSL_BRNG_SABSTRACT+VSL_BRNG_INC)
+#define VSL_BRNG_NONDETERM (VSL_BRNG_SFMT19937+VSL_BRNG_INC)
+#define VSL_BRNG_ARS5 (VSL_BRNG_NONDETERM+VSL_BRNG_INC)
+#define VSL_BRNG_PHILOX4X32X10 (VSL_BRNG_ARS5 +VSL_BRNG_INC)
+
+/*
+//++
+// ACCURACY FLAG FOR DISTRIBUTION GENERATORS
+//--
+*/
+#define VSL_RNG_METHOD_ACCURACY_FLAG (1<<30)
+
+/*
+//++
+// METHOD NAMES FOR DISTRIBUTION RANDOM NUMBER GENERATORS
+// Method names have the form
+//
+//   VSL_RNG_METHOD_<distribution>_<method>
+//
+// where
+//
+//   <distribution> - probability distribution
+//   <method> - method name
+//
+//   VSL_RNG_METHOD_<distribution>_<method> should be used with
+//   vsl<precision>Rng<distribution> function only, where
+//
+//   <precision> - s (single) or d (double)
+//   <distribution> - probability distribution
+//--
+*/
+
+/*
+// Uniform
+//
+//
+// STD standard method. Currently there is only one method for this
+// distribution generator
+*/
+#define VSL_RNG_METHOD_UNIFORM_STD 0 /* vsl{s,d,i}RngUniform */
+
+#define VSL_RNG_METHOD_UNIFORM_STD_ACCURATE \
+    VSL_RNG_METHOD_UNIFORM_STD | VSL_RNG_METHOD_ACCURACY_FLAG
+    /* accurate mode of vsl{d,s}RngUniform */
+
+/*
+// Uniform Bits
+//
+//
+// STD standard method. Currently there is only one method for this
+// distribution generator
+*/
+#define VSL_RNG_METHOD_UNIFORMBITS_STD 0 /* vsliRngUniformBits */
+
+/*
+// Uniform Bits 32
+//
+//
+// STD standard method. Currently there is only one method for this
+// distribution generator
+*/
+#define VSL_RNG_METHOD_UNIFORMBITS32_STD 0 /* vsliRngUniformBits32 */
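+
+/*
+// Illustrative sketch (not part of this header): the BRNG and method macros
+// above are consumed by the generator routines declared in
+// mkl_vsl_functions.h, e.g. 1000 single-precision uniforms on [0,1):
+//
+//   VSLStreamStatePtr stream;
+//   float r[1000];
+//   vslNewStream(&stream, VSL_BRNG_MT19937, 777);
+//   vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1000, r, 0.0f, 1.0f);
+//   vslDeleteStream(&stream);
+*/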
+
+/*
+// Uniform Bits 64
+//
+//
+// STD standard method. Currently there is only one method for this
+// distribution generator
+*/
+#define VSL_RNG_METHOD_UNIFORMBITS64_STD 0 /* vsliRngUniformBits64 */
+
+/*
+// Gaussian
+//
+//
+// BOXMULLER generates normally distributed random number x thru the pair of
+//           uniformly distributed numbers u1 and u2 according to the formula:
+//
+//           x=sqrt(-ln(u1))*sin(2*Pi*u2)
+//
+// BOXMULLER2 generates pair of normally distributed random numbers x1 and x2
+//            thru the pair of uniformly distributed numbers u1 and u2
+//            according to the formula
+//
+//            x1=sqrt(-ln(u1))*sin(2*Pi*u2)
+//            x2=sqrt(-ln(u1))*cos(2*Pi*u2)
+//
+//            NOTE: implementation correctly works with odd vector lengths
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_GAUSSIAN_BOXMULLER 0 /* vsl{d,s}RngGaussian */
+#define VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 /* vsl{d,s}RngGaussian */
+#define VSL_RNG_METHOD_GAUSSIAN_ICDF 2 /* vsl{d,s}RngGaussian */
+
+/*
+// GaussianMV - multivariate (correlated) normal
+// Multivariate (correlated) normal random number generator is based on
+// uncorrelated Gaussian random number generator (see vslsRngGaussian and
+// vsldRngGaussian functions):
+//
+//
+// BOXMULLER generates normally distributed random number x thru the pair of
+//           uniformly distributed numbers u1 and u2 according to the formula:
+//
+//           x=sqrt(-ln(u1))*sin(2*Pi*u2)
+//
+// BOXMULLER2 generates pair of normally distributed random numbers x1 and x2
+//            thru the pair of uniformly distributed numbers u1 and u2
+//            according to the formula
+//
+//            x1=sqrt(-ln(u1))*sin(2*Pi*u2)
+//            x2=sqrt(-ln(u1))*cos(2*Pi*u2)
+//
+//            NOTE: implementation correctly works with odd vector lengths
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_GAUSSIANMV_BOXMULLER 0 /* vsl{d,s}RngGaussianMV */
+#define VSL_RNG_METHOD_GAUSSIANMV_BOXMULLER2 1 /* vsl{d,s}RngGaussianMV */
+#define VSL_RNG_METHOD_GAUSSIANMV_ICDF 2 /* vsl{d,s}RngGaussianMV */
+
+/*
+// Exponential
+//
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_EXPONENTIAL_ICDF 0 /* vsl{d,s}RngExponential */
+
+#define VSL_RNG_METHOD_EXPONENTIAL_ICDF_ACCURATE \
+    VSL_RNG_METHOD_EXPONENTIAL_ICDF | VSL_RNG_METHOD_ACCURACY_FLAG
+    /* accurate mode of vsl{d,s}RngExponential */
+
+/*
+// Laplace
+//
+//
+// ICDF - inverse cumulative distribution function method:
+//
+//        x=+/-ln(u) with probability 1/2,
+//
+//        where
+//
+//        x - random number with Laplace distribution,
+//        u - uniformly distributed random number
+*/
+#define VSL_RNG_METHOD_LAPLACE_ICDF 0 /* vsl{d,s}RngLaplace */
+
+/*
+// Weibull
+//
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_WEIBULL_ICDF 0 /* vsl{d,s}RngWeibull */
+
+#define VSL_RNG_METHOD_WEIBULL_ICDF_ACCURATE \
+    VSL_RNG_METHOD_WEIBULL_ICDF | VSL_RNG_METHOD_ACCURACY_FLAG
+    /* accurate mode of vsl{d,s}RngWeibull */
+
+
+/*
+// Cauchy
+//
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_CAUCHY_ICDF 0 /* vsl{d,s}RngCauchy */
+
+/*
+// Rayleigh
+//
+//
+// ICDF inverse cumulative distribution function method
+*/
+#define VSL_RNG_METHOD_RAYLEIGH_ICDF 0 /* vsl{d,s}RngRayleigh */
+
+#define VSL_RNG_METHOD_RAYLEIGH_ICDF_ACCURATE \
+    VSL_RNG_METHOD_RAYLEIGH_ICDF | VSL_RNG_METHOD_ACCURACY_FLAG
+    /* accurate mode of vsl{d,s}RngRayleigh */
+
+/*
+// Lognormal
+//
+//
+// BOXMULLER2 Box-Muller 2 algorithm based method
+*/
+#define
VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2 0 /* vsl{d,s}RngLognormal */ +#define VSL_RNG_METHOD_LOGNORMAL_ICDF 1 /* vsl{d,s}RngLognormal */ + +#define VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2_ACCURATE \ + VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2 | VSL_RNG_METHOD_ACCURACY_FLAG + /* accurate mode of vsl{d,s}RngLognormal */ + +#define VSL_RNG_METHOD_LOGNORMAL_ICDF_ACCURATE \ + VSL_RNG_METHOD_LOGNORMAL_ICDF | VSL_RNG_METHOD_ACCURACY_FLAG + /* accurate mode of vsl{d,s}RngLognormal */ + + +/* +// Gumbel +// +// +// ICDF inverse cumulative distribution function method +*/ +#define VSL_RNG_METHOD_GUMBEL_ICDF 0 /* vsl{d,s}RngGumbel */ + +/* +// Gamma +// +// Comments: +// alpha>1 - algorithm of Marsaglia is used, nonlinear +// transformation of gaussian numbers based on +// acceptance/rejection method with squeezes; +// alpha>=0.6, alpha<1 - rejection from the Weibull distribution is used; +// alpha<0.6 - transformation of exponential power distribution +// (EPD) is used, EPD random numbers are generated +// by means of acceptance/rejection technique; +// alpha=1 - gamma distribution reduces to exponential +// distribution +*/ +#define VSL_RNG_METHOD_GAMMA_GNORM 0 /* vsl{d,s}RngGamma */ + +#define VSL_RNG_METHOD_GAMMA_GNORM_ACCURATE \ + VSL_RNG_METHOD_GAMMA_GNORM | VSL_RNG_METHOD_ACCURACY_FLAG + /* accurate mode of vsl{d,s}RngGamma */ + + +/* +// Beta +// +// Comments: +// CJA - stands for first letters of Cheng, Johnk, and Atkinson. +// Cheng - for min(p,q) > 1 method of Cheng, +// generation of beta random numbers of the second kind +// based on acceptance/rejection technique and its +// transformation to beta random numbers of the first kind; +// Johnk - for max(p,q) < 1 methods of Johnk and Atkinson: +// if q + K*p^2+C<=0, K=0.852..., C=-0.956... +// algorithm of Johnk: +// beta distributed random number is generated as +// u1^(1/p) / (u1^(1/p)+u2^(1/q)), if u1^(1/p)+u2^(1/q)<=1; +// otherwise switching algorithm of Atkinson: interval (0,1) +// is divided into two domains (0,t) and (t,1), on each interval +// acceptance/rejection technique with convenient majorizing +// function is used; +// Atkinson - for min(p,q)<1, max(p,q)>1 switching algorithm of Atkinson +// is used (with another point t, see short description above); +// ICDF - inverse cumulative distribution function method according +// to formulas x=1-u^(1/q) for p = 1, and x = u^(1/p) for q=1, +// where x is beta distributed random number, +// u - uniformly distributed random number. +// for p=q=1 beta distribution reduces to uniform distribution. +// +*/ +#define VSL_RNG_METHOD_BETA_CJA 0 /* vsl{d,s}RngBeta */ + +#define VSL_RNG_METHOD_BETA_CJA_ACCURATE \ + VSL_RNG_METHOD_BETA_CJA | VSL_RNG_METHOD_ACCURACY_FLAG + /* accurate mode of vsl{d,s}RngBeta */ + +/* +// Bernoulli +// +// +// ICDF inverse cumulative distribution function method +*/ +#define VSL_RNG_METHOD_BERNOULLI_ICDF 0 /* vsliRngBernoulli */ + +/* +// Geometric +// +// +// ICDF inverse cumulative distribution function method +*/ +#define VSL_RNG_METHOD_GEOMETRIC_ICDF 0 /* vsliRngGeometric */ + +/* +// Binomial +// +// +// BTPE for ntrial*min(p,1-p)>30 acceptance/rejection method with +// decomposition onto 4 regions: +// +// * 2 parallelograms; +// * triangle; +// * left exponential tail; +// * right exponential tail. 
+//
+// otherwise table lookup method is used
+*/
+#define VSL_RNG_METHOD_BINOMIAL_BTPE 0 /* vsliRngBinomial */
+
+/*
+// Hypergeometric
+//
+//
+// H2PE if mode of distribution is large, acceptance/rejection method is
+//      used with decomposition onto 3 regions:
+//
+//      * rectangular;
+//      * left exponential tail;
+//      * right exponential tail.
+//
+// otherwise table lookup method is used
+*/
+#define VSL_RNG_METHOD_HYPERGEOMETRIC_H2PE 0 /* vsliRngHypergeometric */
+
+/*
+// Poisson
+//
+//
+// PTPE if lambda>=27, acceptance/rejection method is used with
+//      decomposition onto 4 regions:
+//
+//      * 2 parallelograms;
+//      * triangle;
+//      * left exponential tail;
+//      * right exponential tail.
+//
+// otherwise table lookup method is used
+//
+// POISNORM for lambda>=1 method is based on Poisson inverse CDF
+//          approximation by Gaussian inverse CDF; for lambda<1
+//          table lookup method is used.
+*/
+#define VSL_RNG_METHOD_POISSON_PTPE 0 /* vsliRngPoisson */
+#define VSL_RNG_METHOD_POISSON_POISNORM 1 /* vsliRngPoisson */
+
+/*
+// PoissonV
+//
+//
+// POISNORM for lambda>=1 method is based on Poisson inverse CDF
+//          approximation by Gaussian inverse CDF; for lambda<1
+//          ICDF method is used.
+*/
+#define VSL_RNG_METHOD_POISSONV_POISNORM 0 /* vsliRngPoissonV */
+
+/*
+// Negbinomial
+//
+//
+// NBAR if (a-1)*(1-p)/p>=100, acceptance/rejection method is used with
+//      decomposition onto 5 regions:
+//
+//      * rectangular;
+//      * 2 trapezoids;
+//      * left exponential tail;
+//      * right exponential tail.
+//
+// otherwise table lookup method is used.
+*/
+#define VSL_RNG_METHOD_NEGBINOMIAL_NBAR 0 /* vsliRngNegbinomial */
+
+/*
+//++
+// MATRIX STORAGE SCHEMES
+//--
+*/
+
+/*
+// Some multivariate random number generators, e.g. GaussianMV, operate
+// with matrix parameters. To optimize matrix parameters usage VSL offers
+// the following matrix storage schemes. (See VSL documentation for more
+// details.)
+//
+// FULL     - whole matrix is stored
+// PACKED   - lower/higher triangular matrix is packed in 1-dimensional array
+// DIAGONAL - diagonal elements are packed in 1-dimensional array
+*/
+#define VSL_MATRIX_STORAGE_FULL 0
+#define VSL_MATRIX_STORAGE_PACKED 1
+#define VSL_MATRIX_STORAGE_DIAGONAL 2
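+
+/*
+// Illustrative sketch (not part of this header): the storage scheme is passed
+// to the multivariate generator together with a method macro, e.g. 100 draws
+// from a 3-dimensional correlated Gaussian with mean vector `mean` and a
+// fully stored Cholesky factor `T`:
+//
+//   vsRngGaussianMV(VSL_RNG_METHOD_GAUSSIANMV_BOXMULLER2, stream, 100, r,
+//                   3, VSL_MATRIX_STORAGE_FULL, mean, T);
+*/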
+
+
+/*
+// SUMMARY STATISTICS (SS) RELATED MACRO DEFINITIONS
+*/
+
+/*
+//++
+// MATRIX STORAGE SCHEMES
+//--
+*/
+/*
+// SS routines work with matrix parameters, e.g. matrix of observations,
+// variance-covariance matrix. To optimize work with matrices the library
+// provides the following matrix storage schemes.
+*/
+/*
+// Matrix of observations:
+// ROWS - observations of the random vector are stored in rows, that
+//        is, i-th row of the matrix of observations contains values
+//        of i-th component of the random vector
+// COLS - observations of the random vector are stored in columns, that
+//        is, i-th column of the matrix of observations contains values
+//        of i-th component of the random vector
+*/
+#define VSL_SS_MATRIX_STORAGE_ROWS 0x00010000
+#define VSL_SS_MATRIX_STORAGE_COLS 0x00020000
+
+/*
+// Variance-covariance/correlation matrix:
+// FULL     - whole matrix is stored
+// L_PACKED - lower triangular matrix is stored as 1-dimensional array
+// U_PACKED - upper triangular matrix is stored as 1-dimensional array
+*/
+#define VSL_SS_MATRIX_STORAGE_FULL 0x00000000
+#define VSL_SS_MATRIX_STORAGE_L_PACKED 0x00000001
+#define VSL_SS_MATRIX_STORAGE_U_PACKED 0x00000002
+
+
+/*
+//++
+// SUMMARY STATISTICS LIBRARY METHODS
+//--
+*/
+/*
+// SS routines provide computation of basic statistical estimates
+// (central/raw moments up to 4th order, variance-covariance,
+// minimum, maximum, skewness/kurtosis) using the following methods
+// - FAST - estimates are computed for price of one or two passes over
+//          observations using highly optimized MKL routines
+// - 1PASS - estimate is computed for price of one pass of the observations
+// - FAST_USER_MEAN - estimates are computed for price of one or two passes
+//                    over observations given user defined mean for central
+//                    moments, covariance and correlation
+// - CP_TO_COVCOR - convert cross-product matrix to variance-covariance/
+//                  correlation matrix
+// - SUM_TO_MOM - convert raw/central sums to raw/central moments
+//
+*/
+#define VSL_SS_METHOD_FAST 0x00000001
+#define VSL_SS_METHOD_1PASS 0x00000002
+#define VSL_SS_METHOD_FAST_USER_MEAN 0x00000100
+#define VSL_SS_METHOD_CP_TO_COVCOR 0x00000200
+#define VSL_SS_METHOD_SUM_TO_MOM 0x00000400
+
+/*
+// SS provides routine for parametrization of correlation matrix using
+// SPECTRAL DECOMPOSITION (SD) method
+*/
+#define VSL_SS_METHOD_SD 0x00000004
+
+/*
+// SS routine for robust estimation of variance-covariance matrix
+// and mean supports Rocke algorithm, TBS-estimator
+*/
+#define VSL_SS_METHOD_TBS 0x00000008
+
+/*
+// SS routine for estimation of missing values
+// supports Multiple Imputation (MI) method
+*/
+#define VSL_SS_METHOD_MI 0x00000010
+
+/*
+// SS provides routine for detection of outliers, BACON method
+*/
+#define VSL_SS_METHOD_BACON 0x00000020
+
+/*
+// SS supports routine for estimation of quantiles for streaming data
+// using the following methods:
+// - ZW - intermediate estimates of quantiles during processing
+//        the next block are computed
+// - ZW_FAST - intermediate estimates of quantiles during processing
+//             the next block are not computed
+*/
+#define VSL_SS_METHOD_SQUANTS_ZW 0x00000040
+#define VSL_SS_METHOD_SQUANTS_ZW_FAST 0x00000080
+
+
+/*
+// Input of BACON algorithm is a set of 3 parameters:
+// - Initialization method of the algorithm
+// - Parameter alfa such that 1-alfa is percentile of Chi2 distribution
+// - Stopping criterion
+*/
+/*
+// Number of BACON algorithm parameters
+*/
+#define VSL_SS_BACON_PARAMS_N 3
+
+/*
+// SS implementation of BACON algorithm supports two initialization methods:
+// - Mahalanobis distance based method
+// - Median based method
+*/
+#define VSL_SS_METHOD_BACON_MAHALANOBIS_INIT 0x00000001
+#define VSL_SS_METHOD_BACON_MEDIAN_INIT 0x00000002
+
+/*
+// SS routine for sorting data, RADIX method
+*/
+#define VSL_SS_METHOD_RADIX 0x00100000
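+
+/*
+// Illustrative sketch (not part of this header): the method macros above are
+// combined with the estimate and editor macros below through the SS task API,
+// e.g. a FAST mean over n observations of dimension dim (see the sketch just
+// before the editor macros):
+//
+//   VSLSSTaskPtr task;
+//   vsldSSNewTask(&task, &dim, &n, &xstorage, x, 0, 0);
+//   vsldSSEditTask(task, VSL_SS_ED_MEAN, mean);
+//   vsldSSCompute(task, VSL_SS_MEAN, VSL_SS_METHOD_FAST);
+//   vslSSDeleteTask(&task);
+*/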
+
+/*
+// Input of TBS algorithm is a set of 4 parameters:
+// - Breakdown point
+// - Asymptotic rejection probability
+// - Stopping criterion
+// - Maximum number of iterations
+*/
+/*
+// Number of TBS algorithm parameters
+*/
+#define VSL_SS_TBS_PARAMS_N 4
+
+/*
+// Input of MI algorithm is a set of 5 parameters:
+// - Maximal number of iterations for EM algorithm
+// - Maximal number of iterations for DA algorithm
+// - Stopping criterion
+// - Number of sets to impute
+// - Total number of missing values in dataset
+*/
+/*
+// Number of MI algorithm parameters
+*/
+#define VSL_SS_MI_PARAMS_SIZE 5
+
+/*
+// SS MI algorithm expects that missing values are
+// marked with NANs
+*/
+#define VSL_SS_DNAN 0xFFF8000000000000
+#define VSL_SS_SNAN 0xFFC00000
+
+/*
+// Input of ZW algorithm is 1 parameter:
+// - accuracy of quantile estimation
+*/
+/*
+// Number of ZW algorithm parameters
+*/
+#define VSL_SS_SQUANTS_ZW_PARAMS_N 1
+
+
+/*
+//++
+// MACROS USED BY SS EDIT AND COMPUTE ROUTINES
+//--
+*/
+
+/*
+// SS EditTask routine is a way to edit input and output parameters of the
+// task, e.g., pointers to arrays which hold observations, weights of
+// observations, arrays of mean estimates or covariance estimates.
+// Macros below define parameters available for modification
+*/
+#define VSL_SS_ED_DIMEN 1
+#define VSL_SS_ED_OBSERV_N 2
+#define VSL_SS_ED_OBSERV 3
+#define VSL_SS_ED_OBSERV_STORAGE 4
+#define VSL_SS_ED_INDC 5
+#define VSL_SS_ED_WEIGHTS 6
+#define VSL_SS_ED_MEAN 7
+#define VSL_SS_ED_2R_MOM 8
+#define VSL_SS_ED_3R_MOM 9
+#define VSL_SS_ED_4R_MOM 10
+#define VSL_SS_ED_2C_MOM 11
+#define VSL_SS_ED_3C_MOM 12
+#define VSL_SS_ED_4C_MOM 13
+#define VSL_SS_ED_SUM 67
+#define VSL_SS_ED_2R_SUM 68
+#define VSL_SS_ED_3R_SUM 69
+#define VSL_SS_ED_4R_SUM 70
+#define VSL_SS_ED_2C_SUM 71
+#define VSL_SS_ED_3C_SUM 72
+#define VSL_SS_ED_4C_SUM 73
+#define VSL_SS_ED_KURTOSIS 14
+#define VSL_SS_ED_SKEWNESS 15
+#define VSL_SS_ED_MIN 16
+#define VSL_SS_ED_MAX 17
+#define VSL_SS_ED_VARIATION 18
+#define VSL_SS_ED_COV 19
+#define VSL_SS_ED_COV_STORAGE 20
+#define VSL_SS_ED_COR 21
+#define VSL_SS_ED_COR_STORAGE 22
+#define VSL_SS_ED_CP 74
+#define VSL_SS_ED_CP_STORAGE 75
+#define VSL_SS_ED_ACCUM_WEIGHT 23
+#define VSL_SS_ED_QUANT_ORDER_N 24
+#define VSL_SS_ED_QUANT_ORDER 25
+#define VSL_SS_ED_QUANT_QUANTILES 26
+#define VSL_SS_ED_ORDER_STATS 27
+#define VSL_SS_ED_GROUP_INDC 28
+#define VSL_SS_ED_POOLED_COV_STORAGE 29
+#define VSL_SS_ED_POOLED_MEAN 30
+#define VSL_SS_ED_POOLED_COV 31
+#define VSL_SS_ED_GROUP_COV_INDC 32
+#define VSL_SS_ED_REQ_GROUP_INDC 32
+#define VSL_SS_ED_GROUP_MEAN 33
+#define VSL_SS_ED_GROUP_COV_STORAGE 34
+#define VSL_SS_ED_GROUP_COV 35
+#define VSL_SS_ED_ROBUST_COV_STORAGE 36
+#define VSL_SS_ED_ROBUST_COV_PARAMS_N 37
+#define VSL_SS_ED_ROBUST_COV_PARAMS 38
+#define VSL_SS_ED_ROBUST_MEAN 39
+#define VSL_SS_ED_ROBUST_COV 40
+#define VSL_SS_ED_OUTLIERS_PARAMS_N 41
+#define VSL_SS_ED_OUTLIERS_PARAMS 42
+#define VSL_SS_ED_OUTLIERS_WEIGHT 43
+#define VSL_SS_ED_ORDER_STATS_STORAGE 44
+#define VSL_SS_ED_PARTIAL_COV_IDX 45
+#define VSL_SS_ED_PARTIAL_COV 46
+#define VSL_SS_ED_PARTIAL_COV_STORAGE 47
+#define VSL_SS_ED_PARTIAL_COR 48
+#define VSL_SS_ED_PARTIAL_COR_STORAGE 49
+#define VSL_SS_ED_MI_PARAMS_N 50
+#define VSL_SS_ED_MI_PARAMS 51
+#define VSL_SS_ED_MI_INIT_ESTIMATES_N 52
+#define VSL_SS_ED_MI_INIT_ESTIMATES 53
+#define VSL_SS_ED_MI_SIMUL_VALS_N 54
+#define VSL_SS_ED_MI_SIMUL_VALS 55
+#define VSL_SS_ED_MI_ESTIMATES_N 56
+#define VSL_SS_ED_MI_ESTIMATES 57
+#define VSL_SS_ED_MI_PRIOR_N 58
+#define
VSL_SS_ED_MI_PRIOR 59 +#define VSL_SS_ED_PARAMTR_COR 60 +#define VSL_SS_ED_PARAMTR_COR_STORAGE 61 +#define VSL_SS_ED_STREAM_QUANT_PARAMS_N 62 +#define VSL_SS_ED_STREAM_QUANT_PARAMS 63 +#define VSL_SS_ED_STREAM_QUANT_ORDER_N 64 +#define VSL_SS_ED_STREAM_QUANT_ORDER 65 +#define VSL_SS_ED_STREAM_QUANT_QUANTILES 66 +#define VSL_SS_ED_MDAD 76 +#define VSL_SS_ED_MNAD 77 +#define VSL_SS_ED_SORTED_OBSERV 78 +#define VSL_SS_ED_SORTED_OBSERV_STORAGE 79 + + +/* +// SS Compute routine calculates estimates supported by the library +// Macros below define estimates to compute +*/ +#define VSL_SS_MEAN 0x0000000000000001 +#define VSL_SS_2R_MOM 0x0000000000000002 +#define VSL_SS_3R_MOM 0x0000000000000004 +#define VSL_SS_4R_MOM 0x0000000000000008 +#define VSL_SS_2C_MOM 0x0000000000000010 +#define VSL_SS_3C_MOM 0x0000000000000020 +#define VSL_SS_4C_MOM 0x0000000000000040 +#define VSL_SS_SUM 0x0000000002000000 +#define VSL_SS_2R_SUM 0x0000000004000000 +#define VSL_SS_3R_SUM 0x0000000008000000 +#define VSL_SS_4R_SUM 0x0000000010000000 +#define VSL_SS_2C_SUM 0x0000000020000000 +#define VSL_SS_3C_SUM 0x0000000040000000 +#define VSL_SS_4C_SUM 0x0000000080000000 +#define VSL_SS_KURTOSIS 0x0000000000000080 +#define VSL_SS_SKEWNESS 0x0000000000000100 +#define VSL_SS_VARIATION 0x0000000000000200 +#define VSL_SS_MIN 0x0000000000000400 +#define VSL_SS_MAX 0x0000000000000800 +#define VSL_SS_COV 0x0000000000001000 +#define VSL_SS_COR 0x0000000000002000 +#define VSL_SS_CP 0x0000000100000000 +#define VSL_SS_POOLED_COV 0x0000000000004000 +#define VSL_SS_GROUP_COV 0x0000000000008000 +#define VSL_SS_POOLED_MEAN 0x0000000800000000 +#define VSL_SS_GROUP_MEAN 0x0000001000000000 +#define VSL_SS_QUANTS 0x0000000000010000 +#define VSL_SS_ORDER_STATS 0x0000000000020000 +#define VSL_SS_SORTED_OBSERV 0x0000008000000000 +#define VSL_SS_ROBUST_COV 0x0000000000040000 +#define VSL_SS_OUTLIERS 0x0000000000080000 +#define VSL_SS_PARTIAL_COV 0x0000000000100000 +#define VSL_SS_PARTIAL_COR 0x0000000000200000 +#define VSL_SS_MISSING_VALS 0x0000000000400000 +#define VSL_SS_PARAMTR_COR 0x0000000000800000 +#define VSL_SS_STREAM_QUANTS 0x0000000001000000 +#define VSL_SS_MDAD 0x0000000200000000 +#define VSL_SS_MNAD 0x0000000400000000 + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __MKL_VSL_DEFINES_H__ */ diff --git a/python/ideep4py/include/mkl/mkl_vsl_functions.h b/python/ideep4py/include/mkl/mkl_vsl_functions.h new file mode 100644 index 00000000..b09229bb --- /dev/null +++ b/python/ideep4py/include/mkl/mkl_vsl_functions.h @@ -0,0 +1,854 @@ +/* file: mkl_vsl_functions.h */ +/******************************************************************************* +* Copyright (c) 2006-2017, Intel Corporation +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* * Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of Intel Corporation nor the names of its contributors +* may be used to endorse or promote products derived from this software +* without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* +//++ +// User-level VSL function declarations +//-- +*/ + +#ifndef __MKL_VSL_FUNCTIONS_H__ +#define __MKL_VSL_FUNCTIONS_H__ + +#include "mkl_vsl_types.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* +//++ +// EXTERNAL API MACROS. +// Used to construct VSL function declaration. Change them if you are going to +// provide different API for VSL functions. +//-- +*/ + +#if !defined(_Mkl_Api) +#define _Mkl_Api(rtype,name,arg) extern rtype name arg; +#endif + +#if !defined(_mkl_api) +#define _mkl_api(rtype,name,arg) extern rtype name##_ arg; +#endif + +#if !defined(_MKL_API) +#define _MKL_API(rtype,name,arg) extern rtype name##_ arg; +#endif + +/* +//++ +// VSL CONTINUOUS DISTRIBUTION GENERATOR FUNCTION DECLARATIONS. +//-- +*/ +/* Cauchy distribution */ +_Mkl_Api(int,vdRngCauchy,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGCAUCHY,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrngcauchy,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngCauchy,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGCAUCHY,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrngcauchy,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Uniform distribution */ +_Mkl_Api(int,vdRngUniform,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGUNIFORM,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrnguniform,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngUniform,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGUNIFORM,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrnguniform,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Gaussian distribution */ +_Mkl_Api(int,vdRngGaussian,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGGAUSSIAN,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrnggaussian,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const 
double *, const double *)) +_Mkl_Api(int,vsRngGaussian,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGGAUSSIAN,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrnggaussian,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* GaussianMV distribution */ +_Mkl_Api(int,vdRngGaussianMV,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const MKL_INT , const MKL_INT , const double *, const double *)) +_MKL_API(int,VDRNGGAUSSIANMV,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const MKL_INT *, const MKL_INT *, const double *, const double *)) +_mkl_api(int,vdrnggaussianmv,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const MKL_INT *, const MKL_INT *, const double *, const double *)) +_Mkl_Api(int,vsRngGaussianMV,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const MKL_INT , const MKL_INT , const float *, const float * )) +_MKL_API(int,VSRNGGAUSSIANMV,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const MKL_INT *, const MKL_INT *, const float *, const float * )) +_mkl_api(int,vsrnggaussianmv,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const MKL_INT *, const MKL_INT *, const float *, const float * )) + +/* Exponential distribution */ +_Mkl_Api(int,vdRngExponential,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGEXPONENTIAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrngexponential,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngExponential,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGEXPONENTIAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrngexponential,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Laplace distribution */ +_Mkl_Api(int,vdRngLaplace,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGLAPLACE,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrnglaplace,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngLaplace,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGLAPLACE,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrnglaplace,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Weibull distribution */ +_Mkl_Api(int,vdRngWeibull,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double , const double )) +_MKL_API(int,VDRNGWEIBULL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *)) +_mkl_api(int,vdrngweibull,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *)) +_Mkl_Api(int,vsRngWeibull,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float , const 
float )) +_MKL_API(int,VSRNGWEIBULL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float * )) +_mkl_api(int,vsrngweibull,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float * )) + +/* Rayleigh distribution */ +_Mkl_Api(int,vdRngRayleigh,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGRAYLEIGH,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrngrayleigh,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngRayleigh,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGRAYLEIGH,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrngrayleigh,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Lognormal distribution */ +_Mkl_Api(int,vdRngLognormal,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double , const double , const double )) +_MKL_API(int,VDRNGLOGNORMAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *, const double *)) +_mkl_api(int,vdrnglognormal,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *, const double *)) +_Mkl_Api(int,vsRngLognormal,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float , const float , const float )) +_MKL_API(int,VSRNGLOGNORMAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float *, const float * )) +_mkl_api(int,vsrnglognormal,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float *, const float * )) + +/* Gumbel distribution */ +_Mkl_Api(int,vdRngGumbel,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double )) +_MKL_API(int,VDRNGGUMBEL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_mkl_api(int,vdrnggumbel,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *)) +_Mkl_Api(int,vsRngGumbel,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float )) +_MKL_API(int,VSRNGGUMBEL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) +_mkl_api(int,vsrnggumbel,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float * )) + +/* Gamma distribution */ +_Mkl_Api(int,vdRngGamma,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double , const double )) +_MKL_API(int,VDRNGGAMMA,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *)) +_mkl_api(int,vdrnggamma,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *)) +_Mkl_Api(int,vsRngGamma,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float , const float )) +_MKL_API(int,VSRNGGAMMA,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float * )) 
+_mkl_api(int,vsrnggamma,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float * )) + +/* Beta distribution */ +_Mkl_Api(int,vdRngBeta,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , double [], const double , const double , const double , const double )) +_MKL_API(int,VDRNGBETA,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *, const double *)) +_mkl_api(int,vdrngbeta,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, double [], const double *, const double *, const double *, const double *)) +_Mkl_Api(int,vsRngBeta,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , float [], const float , const float , const float , const float )) +_MKL_API(int,VSRNGBETA,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float *, const float * )) +_mkl_api(int,vsrngbeta,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, float [], const float *, const float *, const float *, const float * )) + +/* +//++ +// VSL DISCRETE DISTRIBUTION GENERATOR FUNCTION DECLARATIONS. +//-- +*/ +/* Bernoulli distribution */ +_Mkl_Api(int,viRngBernoulli,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double )) +_MKL_API(int,VIRNGBERNOULLI,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *)) +_mkl_api(int,virngbernoulli,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *)) + +/* Uniform distribution */ +_Mkl_Api(int,viRngUniform,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const int , const int )) +_MKL_API(int,VIRNGUNIFORM,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const int *, const int *)) +_mkl_api(int,virnguniform,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const int *, const int *)) + +/* UniformBits distribution */ +_Mkl_Api(int,viRngUniformBits,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , unsigned int [])) +_MKL_API(int,VIRNGUNIFORMBITS,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned int [])) +_mkl_api(int,virnguniformbits,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned int [])) + +/* UniformBits32 distribution */ +_Mkl_Api(int,viRngUniformBits32,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , unsigned int [])) +_MKL_API(int,VIRNGUNIFORMBITS32,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned int [])) +_mkl_api(int,virnguniformbits32,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned int [])) + +/* UniformBits64 distribution */ +_Mkl_Api(int,viRngUniformBits64,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , unsigned MKL_INT64 [])) +_MKL_API(int,VIRNGUNIFORMBITS64,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned MKL_INT64 [])) +_mkl_api(int,virnguniformbits64,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, unsigned MKL_INT64 [])) + +/* Geometric distribution */ +_Mkl_Api(int,viRngGeometric,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double )) +_MKL_API(int,VIRNGGEOMETRIC,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *)) +_mkl_api(int,virnggeometric,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *)) + +/* Binomial distribution */ +_Mkl_Api(int,viRngBinomial,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const int , const double )) +_MKL_API(int,VIRNGBINOMIAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT 
*, int [], const int *, const double *))
+_mkl_api(int,virngbinomial,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const int *, const double *))
+
+/* Hypergeometric distribution */
+_Mkl_Api(int,viRngHypergeometric,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const int , const int , const int ))
+_MKL_API(int,VIRNGHYPERGEOMETRIC,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const int *, const int *, const int *))
+_mkl_api(int,virnghypergeometric,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const int *, const int *, const int *))
+
+/* Negbinomial distribution */
+_Mkl_Api(int,viRngNegbinomial,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double , const double ))
+_Mkl_Api(int,viRngNegBinomial,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double , const double ))
+_MKL_API(int,VIRNGNEGBINOMIAL,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *, const double *))
+_mkl_api(int,virngnegbinomial,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *, const double *))
+
+/* Poisson distribution */
+_Mkl_Api(int,viRngPoisson,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double ))
+_MKL_API(int,VIRNGPOISSON,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *))
+_mkl_api(int,virngpoisson,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double *))
+
+/* PoissonV distribution */
+_Mkl_Api(int,viRngPoissonV,(const MKL_INT , VSLStreamStatePtr , const MKL_INT , int [], const double []))
+_MKL_API(int,VIRNGPOISSONV,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double []))
+_mkl_api(int,virngpoissonv,(const MKL_INT *, VSLStreamStatePtr *, const MKL_INT *, int [], const double []))
+
+
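+/*
+// A minimal usage sketch for the generators declared above (illustrative
+// only). The BRNG and method constants come from mkl_vsl_defines.h, and
+// vslNewStream / vslDeleteStream are declared in the service section below.
+// This draws 1000 standard Gaussian doubles from an MT19937 stream seeded
+// with 777; each call returns VSL_ERROR_OK (0) on success:
+//
+//   VSLStreamStatePtr stream;
+//   double r[1000];
+//   int err;
+//   err = vslNewStream(&stream, VSL_BRNG_MT19937, 777);
+//   err = vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream,
+//                       1000, r, 0.0, 1.0);            /* mean 0, sigma 1 */
+//   err = vslDeleteStream(&stream);
+*/
+
+/*
+//++
+// VSL SERVICE FUNCTION DECLARATIONS.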
+//-- +*/ +/* NewStream - stream creation/initialization */ +_Mkl_Api(int,vslNewStream,(VSLStreamStatePtr* , const MKL_INT , const MKL_UINT )) +_mkl_api(int,vslnewstream,(VSLStreamStatePtr* , const MKL_INT *, const MKL_UINT *)) +_MKL_API(int,VSLNEWSTREAM,(VSLStreamStatePtr* , const MKL_INT *, const MKL_UINT *)) + +/* NewStreamEx - advanced stream creation/initialization */ +_Mkl_Api(int,vslNewStreamEx,(VSLStreamStatePtr* , const MKL_INT , const MKL_INT , const unsigned int[])) +_mkl_api(int,vslnewstreamex,(VSLStreamStatePtr* , const MKL_INT *, const MKL_INT *, const unsigned int[])) +_MKL_API(int,VSLNEWSTREAMEX,(VSLStreamStatePtr* , const MKL_INT *, const MKL_INT *, const unsigned int[])) + +_Mkl_Api(int,vsliNewAbstractStream,(VSLStreamStatePtr* , const MKL_INT , const unsigned int[], const iUpdateFuncPtr)) +_mkl_api(int,vslinewabstractstream,(VSLStreamStatePtr* , const MKL_INT *, const unsigned int[], const iUpdateFuncPtr)) +_MKL_API(int,VSLINEWABSTRACTSTREAM,(VSLStreamStatePtr* , const MKL_INT *, const unsigned int[], const iUpdateFuncPtr)) + +_Mkl_Api(int,vsldNewAbstractStream,(VSLStreamStatePtr* , const MKL_INT , const double[], const double , const double , const dUpdateFuncPtr)) +_mkl_api(int,vsldnewabstractstream,(VSLStreamStatePtr* , const MKL_INT *, const double[], const double *, const double *, const dUpdateFuncPtr)) +_MKL_API(int,VSLDNEWABSTRACTSTREAM,(VSLStreamStatePtr* , const MKL_INT *, const double[], const double *, const double *, const dUpdateFuncPtr)) + +_Mkl_Api(int,vslsNewAbstractStream,(VSLStreamStatePtr* , const MKL_INT , const float[], const float , const float , const sUpdateFuncPtr)) +_mkl_api(int,vslsnewabstractstream,(VSLStreamStatePtr* , const MKL_INT *, const float[], const float *, const float *, const sUpdateFuncPtr)) +_MKL_API(int,VSLSNEWABSTRACTSTREAM,(VSLStreamStatePtr* , const MKL_INT *, const float[], const float *, const float *, const sUpdateFuncPtr)) + +/* DeleteStream - delete stream */ +_Mkl_Api(int,vslDeleteStream,(VSLStreamStatePtr*)) +_mkl_api(int,vsldeletestream,(VSLStreamStatePtr*)) +_MKL_API(int,VSLDELETESTREAM,(VSLStreamStatePtr*)) + +/* CopyStream - copy all stream information */ +_Mkl_Api(int,vslCopyStream,(VSLStreamStatePtr*, const VSLStreamStatePtr)) +_mkl_api(int,vslcopystream,(VSLStreamStatePtr*, const VSLStreamStatePtr)) +_MKL_API(int,VSLCOPYSTREAM,(VSLStreamStatePtr*, const VSLStreamStatePtr)) + +/* CopyStreamState - copy stream state only */ +_Mkl_Api(int,vslCopyStreamState,(VSLStreamStatePtr , const VSLStreamStatePtr )) +_mkl_api(int,vslcopystreamstate,(VSLStreamStatePtr *, const VSLStreamStatePtr *)) +_MKL_API(int,VSLCOPYSTREAMSTATE,(VSLStreamStatePtr *, const VSLStreamStatePtr *)) + +/* LeapfrogStream - leapfrog method */ +_Mkl_Api(int,vslLeapfrogStream,(VSLStreamStatePtr , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslleapfrogstream,(VSLStreamStatePtr *, const MKL_INT *, const MKL_INT *)) +_MKL_API(int,VSLLEAPFROGSTREAM,(VSLStreamStatePtr *, const MKL_INT *, const MKL_INT *)) + +/* SkipAheadStream - skip-ahead method */ +_Mkl_Api(int,vslSkipAheadStream,(VSLStreamStatePtr , const long long int )) +_mkl_api(int,vslskipaheadstream,(VSLStreamStatePtr *, const long long int *)) +_MKL_API(int,VSLSKIPAHEADSTREAM,(VSLStreamStatePtr *, const long long int *)) + +/* GetStreamStateBrng - get BRNG associated with given stream */ +_Mkl_Api(int,vslGetStreamStateBrng,(const VSLStreamStatePtr )) +_mkl_api(int,vslgetstreamstatebrng,(const VSLStreamStatePtr *)) +_MKL_API(int,VSLGETSTREAMSTATEBRNG,(const VSLStreamStatePtr *)) + +/* 
GetNumRegBrngs - get number of registered BRNGs */
+_Mkl_Api(int,vslGetNumRegBrngs,(void))
+_mkl_api(int,vslgetnumregbrngs,(void))
+_MKL_API(int,VSLGETNUMREGBRNGS,(void))
+
+/* RegisterBrng - register new BRNG */
+_Mkl_Api(int,vslRegisterBrng,(const VSLBRngProperties* ))
+_mkl_api(int,vslregisterbrng,(const VSLBRngProperties* ))
+_MKL_API(int,VSLREGISTERBRNG,(const VSLBRngProperties* ))
+
+/* GetBrngProperties - get BRNG properties */
+_Mkl_Api(int,vslGetBrngProperties,(const int , VSLBRngProperties* ))
+_mkl_api(int,vslgetbrngproperties,(const int *, VSLBRngProperties* ))
+_MKL_API(int,VSLGETBRNGPROPERTIES,(const int *, VSLBRngProperties* ))
+
+/* SaveStreamF - save random stream descriptive data to file */
+_Mkl_Api(int,vslSaveStreamF,(const VSLStreamStatePtr , const char* ))
+_mkl_api(int,vslsavestreamf,(const VSLStreamStatePtr *, const char* , const int ))
+_MKL_API(int,VSLSAVESTREAMF,(const VSLStreamStatePtr *, const char* , const int ))
+
+/* LoadStreamF - load random stream descriptive data from file */
+_Mkl_Api(int,vslLoadStreamF,(VSLStreamStatePtr *, const char* ))
+_mkl_api(int,vslloadstreamf,(VSLStreamStatePtr *, const char* , const int ))
+_MKL_API(int,VSLLOADSTREAMF,(VSLStreamStatePtr *, const char* , const int ))
+
+/* SaveStreamM - save random stream descriptive data to memory */
+_Mkl_Api(int,vslSaveStreamM,(const VSLStreamStatePtr , char* ))
+_mkl_api(int,vslsavestreamm,(const VSLStreamStatePtr *, char* ))
+_MKL_API(int,VSLSAVESTREAMM,(const VSLStreamStatePtr *, char* ))
+
+/* LoadStreamM - load random stream descriptive data from memory */
+_Mkl_Api(int,vslLoadStreamM,(VSLStreamStatePtr *, const char* ))
+_mkl_api(int,vslloadstreamm,(VSLStreamStatePtr *, const char* ))
+_MKL_API(int,VSLLOADSTREAMM,(VSLStreamStatePtr *, const char* ))
+
+/* GetStreamSize - get size of random stream descriptive data */
+_Mkl_Api(int,vslGetStreamSize,(const VSLStreamStatePtr))
+_mkl_api(int,vslgetstreamsize,(const VSLStreamStatePtr))
+_MKL_API(int,VSLGETSTREAMSIZE,(const VSLStreamStatePtr))
+
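+/*
+// Sketch: checkpointing a stream with the file routines above (file name
+// illustrative). vslSaveStreamF writes the complete stream state to a file
+// and vslLoadStreamF recreates it later, so a simulation can resume with an
+// identical random sequence:
+//
+//   err = vslSaveStreamF(stream, "mystream.bin");   /* checkpoint */
+//   ...
+//   err = vslLoadStreamF(&stream, "mystream.bin");  /* restore    */
+*/
+
+/*
+//++
+// VSL CONVOLUTION AND CORRELATION FUNCTION DECLARATIONS.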
+//-- +*/ + +_Mkl_Api(int,vsldConvNewTask,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vsldconvnewtask,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLDCONVNEWTASK,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslsConvNewTask,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslsconvnewtask,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLSCONVNEWTASK,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslzConvNewTask,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslzconvnewtask,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLZCONVNEWTASK,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslcConvNewTask,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslcconvnewtask,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLCCONVNEWTASK,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vsldCorrNewTask,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vsldcorrnewtask,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLDCORRNEWTASK,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslsCorrNewTask,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslscorrnewtask,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLSCORRNEWTASK,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslzCorrNewTask,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslzcorrnewtask,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLZCORRNEWTASK,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + +_Mkl_Api(int,vslcCorrNewTask,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_mkl_api(int,vslccorrnewtask,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) +_MKL_API(int,VSLCCORRNEWTASK,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [])) + + +_Mkl_Api(int,vsldConvNewTask1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vsldconvnewtask1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const 
MKL_INT* )) +_MKL_API(int,VSLDCONVNEWTASK1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslsConvNewTask1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslsconvnewtask1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLSCONVNEWTASK1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslzConvNewTask1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslzconvnewtask1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLZCONVNEWTASK1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslcConvNewTask1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslcconvnewtask1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLCCONVNEWTASK1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vsldCorrNewTask1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vsldcorrnewtask1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLDCORRNEWTASK1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslsCorrNewTask1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslscorrnewtask1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLSCORRNEWTASK1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslzCorrNewTask1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslzcorrnewtask1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLZCORRNEWTASK1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + +_Mkl_Api(int,vslcCorrNewTask1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT )) +_mkl_api(int,vslccorrnewtask1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) +_MKL_API(int,VSLCCORRNEWTASK1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* )) + + +_Mkl_Api(int,vsldConvNewTaskX,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) +_mkl_api(int,vsldconvnewtaskx,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) +_MKL_API(int,VSLDCONVNEWTASKX,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) + +_Mkl_Api(int,vslsConvNewTaskX,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) +_mkl_api(int,vslsconvnewtaskx,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) 
+_MKL_API(int,VSLSCONVNEWTASKX,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) + +_Mkl_Api(int,vslzConvNewTaskX,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzconvnewtaskx,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCONVNEWTASKX,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcConvNewTaskX,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslcconvnewtaskx,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCONVNEWTASKX,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) + +_Mkl_Api(int,vsldCorrNewTaskX,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) +_mkl_api(int,vsldcorrnewtaskx,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) +_MKL_API(int,VSLDCORRNEWTASKX,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const double [], const MKL_INT [])) + +_Mkl_Api(int,vslsCorrNewTaskX,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) +_mkl_api(int,vslscorrnewtaskx,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) +_MKL_API(int,VSLSCORRNEWTASKX,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const float [], const MKL_INT [])) + +_Mkl_Api(int,vslzCorrNewTaskX,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzcorrnewtaskx,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCORRNEWTASKX,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcCorrNewTaskX,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslccorrnewtaskx,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCORRNEWTASKX,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT [], const MKL_INT [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [])) + + +_Mkl_Api(int,vsldConvNewTaskX1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const double [], const MKL_INT )) 
+_mkl_api(int,vsldconvnewtaskx1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const MKL_INT* )) +_MKL_API(int,VSLDCONVNEWTASKX1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const MKL_INT* )) + +_Mkl_Api(int,vslsConvNewTaskX1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const float [], const MKL_INT )) +_mkl_api(int,vslsconvnewtaskx1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const MKL_INT* )) +_MKL_API(int,VSLSCONVNEWTASKX1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const MKL_INT* )) + +_Mkl_Api(int,vslzConvNewTaskX1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_Complex16 [], const MKL_INT )) +_mkl_api(int,vslzconvnewtaskx1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex16 [], const MKL_INT* )) +_MKL_API(int,VSLZCONVNEWTASKX1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex16 [], const MKL_INT* )) + +_Mkl_Api(int,vslcConvNewTaskX1D,(VSLConvTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_Complex8 [], const MKL_INT )) +_mkl_api(int,vslcconvnewtaskx1d,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex8 [], const MKL_INT* )) +_MKL_API(int,VSLCCONVNEWTASKX1D,(VSLConvTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex8 [], const MKL_INT* )) + +_Mkl_Api(int,vsldCorrNewTaskX1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const double [], const MKL_INT )) +_mkl_api(int,vsldcorrnewtaskx1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const MKL_INT* )) +_MKL_API(int,VSLDCORRNEWTASKX1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const MKL_INT* )) + +_Mkl_Api(int,vslsCorrNewTaskX1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const float [], const MKL_INT )) +_mkl_api(int,vslscorrnewtaskx1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const MKL_INT* )) +_MKL_API(int,VSLSCORRNEWTASKX1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const MKL_INT* )) + +_Mkl_Api(int,vslzCorrNewTaskX1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_Complex16 [], const MKL_INT )) +_mkl_api(int,vslzcorrnewtaskx1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex16 [], const MKL_INT* )) +_MKL_API(int,VSLZCORRNEWTASKX1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex16 [], const MKL_INT* )) + +_Mkl_Api(int,vslcCorrNewTaskX1D,(VSLCorrTaskPtr* , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_INT , const MKL_Complex8 [], const MKL_INT )) +_mkl_api(int,vslccorrnewtaskx1d,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const MKL_Complex8 [], const MKL_INT* )) +_MKL_API(int,VSLCCORRNEWTASKX1D,(VSLCorrTaskPtr* , const MKL_INT* , const MKL_INT* , const 
MKL_INT* , const MKL_INT* , const MKL_Complex8 [], const MKL_INT* )) + + +_Mkl_Api(int,vslConvDeleteTask,(VSLConvTaskPtr* )) +_mkl_api(int,vslconvdeletetask,(VSLConvTaskPtr* )) +_MKL_API(int,VSLCONVDeleteTask,(VSLConvTaskPtr* )) + +_Mkl_Api(int,vslCorrDeleteTask,(VSLCorrTaskPtr* )) +_mkl_api(int,vslcorrdeletetask,(VSLCorrTaskPtr* )) +_MKL_API(int,VSLCORRDeleteTask,(VSLCorrTaskPtr* )) + + +_Mkl_Api(int,vslConvCopyTask,(VSLConvTaskPtr* , const VSLConvTaskPtr )) +_mkl_api(int,vslconvcopytask,(VSLConvTaskPtr* , const VSLConvTaskPtr* )) +_MKL_API(int,VSLCONVCopyTask,(VSLConvTaskPtr* , const VSLConvTaskPtr* )) + +_Mkl_Api(int,vslCorrCopyTask,(VSLCorrTaskPtr* , const VSLCorrTaskPtr )) +_mkl_api(int,vslcorrcopytask,(VSLCorrTaskPtr* , const VSLCorrTaskPtr* )) +_MKL_API(int,VSLCORRCopyTask,(VSLCorrTaskPtr* , const VSLCorrTaskPtr* )) + + +_Mkl_Api(int,vslConvSetMode,(VSLConvTaskPtr , const MKL_INT )) +_mkl_api(int,vslconvsetmode,(VSLConvTaskPtr* , const MKL_INT* )) +_MKL_API(int,VSLCONVSETMODE,(VSLConvTaskPtr* , const MKL_INT* )) + +_Mkl_Api(int,vslCorrSetMode,(VSLCorrTaskPtr , const MKL_INT )) +_mkl_api(int,vslcorrsetmode,(VSLCorrTaskPtr* , const MKL_INT* )) +_MKL_API(int,VSLCORRSETMODE,(VSLCorrTaskPtr* , const MKL_INT* )) + + +_Mkl_Api(int,vslConvSetInternalPrecision,(VSLConvTaskPtr , const MKL_INT )) +_mkl_api(int,vslconvsetinternalprecision,(VSLConvTaskPtr* , const MKL_INT* )) +_MKL_API(int,VSLCONVSETINTERNALPRECISION,(VSLConvTaskPtr* , const MKL_INT* )) + +_Mkl_Api(int,vslCorrSetInternalPrecision,(VSLCorrTaskPtr , const MKL_INT )) +_mkl_api(int,vslcorrsetinternalprecision,(VSLCorrTaskPtr* , const MKL_INT* )) +_MKL_API(int,VSLCORRSETINTERNALPRECISION,(VSLCorrTaskPtr* , const MKL_INT* )) + + +_Mkl_Api(int,vslConvSetStart,(VSLConvTaskPtr , const MKL_INT [])) +_mkl_api(int,vslconvsetstart,(VSLConvTaskPtr* , const MKL_INT [])) +_MKL_API(int,VSLCONVSETSTART,(VSLConvTaskPtr* , const MKL_INT [])) + +_Mkl_Api(int,vslCorrSetStart,(VSLCorrTaskPtr , const MKL_INT [])) +_mkl_api(int,vslcorrsetstart,(VSLCorrTaskPtr* , const MKL_INT [])) +_MKL_API(int,VSLCORRSETSTART,(VSLCorrTaskPtr* , const MKL_INT [])) + + +_Mkl_Api(int,vslConvSetDecimation,(VSLConvTaskPtr , const MKL_INT [])) +_mkl_api(int,vslconvsetdecimation,(VSLConvTaskPtr* , const MKL_INT [])) +_MKL_API(int,VSLCONVSETDECIMATION,(VSLConvTaskPtr* , const MKL_INT [])) + +_Mkl_Api(int,vslCorrSetDecimation,(VSLCorrTaskPtr , const MKL_INT [])) +_mkl_api(int,vslcorrsetdecimation,(VSLCorrTaskPtr* , const MKL_INT [])) +_MKL_API(int,VSLCORRSETDECIMATION,(VSLCorrTaskPtr* , const MKL_INT [])) + + +_Mkl_Api(int,vsldConvExec,(VSLConvTaskPtr , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) +_mkl_api(int,vsldconvexec,(VSLConvTaskPtr* , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) +_MKL_API(int,VSLDCONVEXEC,(VSLConvTaskPtr* , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) + +_Mkl_Api(int,vslsConvExec,(VSLConvTaskPtr , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) +_mkl_api(int,vslsconvexec,(VSLConvTaskPtr* , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) +_MKL_API(int,VSLSCONVEXEC,(VSLConvTaskPtr* , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) + +_Mkl_Api(int,vslzConvExec,(VSLConvTaskPtr , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const 
MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzconvexec,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCONVEXEC,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcConvExec,(VSLConvTaskPtr , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslcconvexec,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCONVEXEC,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) + +_Mkl_Api(int,vsldCorrExec,(VSLCorrTaskPtr , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) +_mkl_api(int,vsldcorrexec,(VSLCorrTaskPtr* , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) +_MKL_API(int,VSLDCORREXEC,(VSLCorrTaskPtr* , const double [], const MKL_INT [], const double [], const MKL_INT [], double [], const MKL_INT [])) + +_Mkl_Api(int,vslsCorrExec,(VSLCorrTaskPtr , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) +_mkl_api(int,vslscorrexec,(VSLCorrTaskPtr* , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) +_MKL_API(int,VSLSCORREXEC,(VSLCorrTaskPtr* , const float [], const MKL_INT [], const float [], const MKL_INT [], float [], const MKL_INT [])) + +_Mkl_Api(int,vslzCorrExec,(VSLCorrTaskPtr , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzcorrexec,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCORREXEC,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT [], const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcCorrExec,(VSLCorrTaskPtr , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslccorrexec,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCORREXEC,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT [], const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) + + +_Mkl_Api(int,vsldConvExec1D,(VSLConvTaskPtr , const double [], const MKL_INT , const double [], const MKL_INT , double [], const MKL_INT )) +_mkl_api(int,vsldconvexec1d,(VSLConvTaskPtr* , const double [], const MKL_INT* , const double [], const MKL_INT* , double [], const MKL_INT* )) +_MKL_API(int,VSLDCONVEXEC1D,(VSLConvTaskPtr* , const double [], const MKL_INT* , const double [], const MKL_INT* , double [], const MKL_INT* )) + +_Mkl_Api(int,vslsConvExec1D,(VSLConvTaskPtr , const float [], const MKL_INT , const float [], const MKL_INT , float [], const MKL_INT )) +_mkl_api(int,vslsconvexec1d,(VSLConvTaskPtr* , const float [], const MKL_INT* , const float [], const MKL_INT* , float [], const MKL_INT* )) +_MKL_API(int,VSLSCONVEXEC1D,(VSLConvTaskPtr* , const float [], const MKL_INT* , const float [], 
const MKL_INT* , float [], const MKL_INT* )) + +_Mkl_Api(int,vslzConvExec1D,(VSLConvTaskPtr , const MKL_Complex16 [], const MKL_INT , const MKL_Complex16 [], const MKL_INT , MKL_Complex16 [], const MKL_INT )) +_mkl_api(int,vslzconvexec1d,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) +_MKL_API(int,VSLZCONVEXEC1D,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) + +_Mkl_Api(int,vslcConvExec1D,(VSLConvTaskPtr , const MKL_Complex8 [], const MKL_INT , const MKL_Complex8 [], const MKL_INT , MKL_Complex8 [], const MKL_INT )) +_mkl_api(int,vslcconvexec1d,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* )) +_MKL_API(int,VSLCCONVEXEC1D,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* )) + +_Mkl_Api(int,vsldCorrExec1D,(VSLCorrTaskPtr , const double [], const MKL_INT , const double [], const MKL_INT , double [], const MKL_INT )) +_mkl_api(int,vsldcorrexec1d,(VSLCorrTaskPtr* , const double [], const MKL_INT* , const double [], const MKL_INT* , double [], const MKL_INT* )) +_MKL_API(int,VSLDCORREXEC1D,(VSLCorrTaskPtr* , const double [], const MKL_INT* , const double [], const MKL_INT* , double [], const MKL_INT* )) + +_Mkl_Api(int,vslsCorrExec1D,(VSLCorrTaskPtr , const float [], const MKL_INT , const float [], const MKL_INT , float [], const MKL_INT )) +_mkl_api(int,vslscorrexec1d,(VSLCorrTaskPtr* , const float [], const MKL_INT* , const float [], const MKL_INT* , float [], const MKL_INT* )) +_MKL_API(int,VSLSCORREXEC1D,(VSLCorrTaskPtr* , const float [], const MKL_INT* , const float [], const MKL_INT* , float [], const MKL_INT* )) + +_Mkl_Api(int,vslzCorrExec1D,(VSLCorrTaskPtr , const MKL_Complex16 [], const MKL_INT , const MKL_Complex16 [], const MKL_INT , MKL_Complex16 [], const MKL_INT )) +_mkl_api(int,vslzcorrexec1d,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) +_MKL_API(int,VSLZCORREXEC1D,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) + +_Mkl_Api(int,vslcCorrExec1D,(VSLCorrTaskPtr , const MKL_Complex8 [], const MKL_INT , const MKL_Complex8 [], const MKL_INT , MKL_Complex8 [], const MKL_INT )) +_mkl_api(int,vslccorrexec1d,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* )) +_MKL_API(int,VSLCCORREXEC1D,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* )) + + +_Mkl_Api(int,vsldConvExecX,(VSLConvTaskPtr , const double [], const MKL_INT [], double [], const MKL_INT [])) +_mkl_api(int,vsldconvexecx,(VSLConvTaskPtr* , const double [], const MKL_INT [], double [], const MKL_INT [])) +_MKL_API(int,VSLDCONVEXECX,(VSLConvTaskPtr* , const double [], const MKL_INT [], double [], const MKL_INT [])) + +_Mkl_Api(int,vslsConvExecX,(VSLConvTaskPtr , const float [], const MKL_INT [], float [], const MKL_INT [])) +_mkl_api(int,vslsconvexecx,(VSLConvTaskPtr* , const float [], const MKL_INT [], float [], const MKL_INT [])) +_MKL_API(int,VSLSCONVEXECX,(VSLConvTaskPtr* , const float [], const MKL_INT [], float [], const MKL_INT [])) + 
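+/*
+// Sketch: a complete 1D double-precision convolution through the task API
+// declared above (x and y assumed filled elsewhere). VSL_CONV_MODE_AUTO, from
+// mkl_vsl_defines.h, lets VSL choose between the direct and FFT algorithms;
+// for a full convolution the result length is xshape + yshape - 1:
+//
+//   VSLConvTaskPtr task;
+//   double x[100], y[16], z[115];                 /* 100 + 16 - 1 = 115 */
+//   int err;
+//   err = vsldConvNewTask1D(&task, VSL_CONV_MODE_AUTO, 100, 16, 115);
+//   err = vsldConvExec1D(task, x, 1, y, 1, z, 1); /* unit strides       */
+//   err = vslConvDeleteTask(&task);
+*/
+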
+_Mkl_Api(int,vslzConvExecX,(VSLConvTaskPtr , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzconvexecx,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCONVEXECX,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcConvExecX,(VSLConvTaskPtr , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslcconvexecx,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCONVEXECX,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) + +_Mkl_Api(int,vsldCorrExecX,(VSLCorrTaskPtr , const double [], const MKL_INT [], double [], const MKL_INT [])) +_mkl_api(int,vsldcorrexecx,(VSLCorrTaskPtr* , const double [], const MKL_INT [], double [], const MKL_INT [])) +_MKL_API(int,VSLDCORREXECX,(VSLCorrTaskPtr* , const double [], const MKL_INT [], double [], const MKL_INT [])) + +_Mkl_Api(int,vslsCorrExecX,(VSLCorrTaskPtr , const float [], const MKL_INT [], float [], const MKL_INT [])) +_mkl_api(int,vslscorrexecx,(VSLCorrTaskPtr* , const float [], const MKL_INT [], float [], const MKL_INT [])) +_MKL_API(int,VSLSCORREXECX,(VSLCorrTaskPtr* , const float [], const MKL_INT [], float [], const MKL_INT [])) + +_Mkl_Api(int,vslzCorrExecX,(VSLCorrTaskPtr , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_mkl_api(int,vslzcorrexecx,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) +_MKL_API(int,VSLZCORREXECX,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT [], MKL_Complex16 [], const MKL_INT [])) + +_Mkl_Api(int,vslcCorrExecX,(VSLCorrTaskPtr , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_mkl_api(int,vslccorrexecx,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) +_MKL_API(int,VSLCCORREXECX,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT [], MKL_Complex8 [], const MKL_INT [])) + + +_Mkl_Api(int,vsldConvExecX1D,(VSLConvTaskPtr , const double [], const MKL_INT , double [], const MKL_INT )) +_mkl_api(int,vsldconvexecx1d,(VSLConvTaskPtr* , const double [], const MKL_INT* , double [], const MKL_INT* )) +_MKL_API(int,VSLDCONVEXECX1D,(VSLConvTaskPtr* , const double [], const MKL_INT* , double [], const MKL_INT* )) + +_Mkl_Api(int,vslsConvExecX1D,(VSLConvTaskPtr , const float [], const MKL_INT , float [], const MKL_INT )) +_mkl_api(int,vslsconvexecx1d,(VSLConvTaskPtr* , const float [], const MKL_INT* , float [], const MKL_INT* )) +_MKL_API(int,VSLSCONVEXECX1D,(VSLConvTaskPtr* , const float [], const MKL_INT* , float [], const MKL_INT* )) + +_Mkl_Api(int,vslzConvExecX1D,(VSLConvTaskPtr , const MKL_Complex16 [], const MKL_INT , MKL_Complex16 [], const MKL_INT )) +_mkl_api(int,vslzconvexecx1d,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) +_MKL_API(int,VSLZCONVEXECX1D,(VSLConvTaskPtr* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* )) + +_Mkl_Api(int,vslcConvExecX1D,(VSLConvTaskPtr , const MKL_Complex8 [], const MKL_INT , MKL_Complex8 [], const MKL_INT )) +_mkl_api(int,vslcconvexecx1d,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* )) +_MKL_API(int,VSLCCONVEXECX1D,(VSLConvTaskPtr* , const MKL_Complex8 [], const MKL_INT* , 
MKL_Complex8 [], const MKL_INT* ))
+
+_Mkl_Api(int,vsldCorrExecX1D,(VSLCorrTaskPtr , const double [], const MKL_INT , double [], const MKL_INT ))
+_mkl_api(int,vsldcorrexecx1d,(VSLCorrTaskPtr* , const double [], const MKL_INT* , double [], const MKL_INT* ))
+_MKL_API(int,VSLDCORREXECX1D,(VSLCorrTaskPtr* , const double [], const MKL_INT* , double [], const MKL_INT* ))
+
+_Mkl_Api(int,vslsCorrExecX1D,(VSLCorrTaskPtr , const float [], const MKL_INT , float [], const MKL_INT ))
+_mkl_api(int,vslscorrexecx1d,(VSLCorrTaskPtr* , const float [], const MKL_INT* , float [], const MKL_INT* ))
+_MKL_API(int,VSLSCORREXECX1D,(VSLCorrTaskPtr* , const float [], const MKL_INT* , float [], const MKL_INT* ))
+
+_Mkl_Api(int,vslzCorrExecX1D,(VSLCorrTaskPtr , const MKL_Complex16 [], const MKL_INT , MKL_Complex16 [], const MKL_INT ))
+_mkl_api(int,vslzcorrexecx1d,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* ))
+_MKL_API(int,VSLZCORREXECX1D,(VSLCorrTaskPtr* , const MKL_Complex16 [], const MKL_INT* , MKL_Complex16 [], const MKL_INT* ))
+
+_Mkl_Api(int,vslcCorrExecX1D,(VSLCorrTaskPtr , const MKL_Complex8 [], const MKL_INT , MKL_Complex8 [], const MKL_INT ))
+_mkl_api(int,vslccorrexecx1d,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* ))
+_MKL_API(int,VSLCCORREXECX1D,(VSLCorrTaskPtr* , const MKL_Complex8 [], const MKL_INT* , MKL_Complex8 [], const MKL_INT* ))
+
+
+/*
+//++
+// SUMMARY STATISTICS LIBRARY ROUTINES
+//--
+*/
+
+/*
+// Task constructors
+*/
+_Mkl_Api(int,vsldSSNewTask,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const double [], const MKL_INT []))
+_mkl_api(int,vsldssnewtask,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const double [], const MKL_INT []))
+_MKL_API(int,VSLDSSNEWTASK,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const double [], const double [], const MKL_INT []))
+
+_Mkl_Api(int,vslsSSNewTask,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const float [], const MKL_INT []))
+_mkl_api(int,vslsssnewtask,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const float [], const MKL_INT []))
+_MKL_API(int,VSLSSSNEWTASK,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const MKL_INT* , const float [], const float [], const MKL_INT []))
+
+
+/*
+// Task editors
+*/
+
+/*
+// Editor to modify a task parameter
+*/
+_Mkl_Api(int,vsldSSEditTask,(VSLSSTaskPtr , const MKL_INT , const double* ))
+_mkl_api(int,vsldssedittask,(VSLSSTaskPtr* , const MKL_INT* , const double* ))
+_MKL_API(int,VSLDSSEDITTASK,(VSLSSTaskPtr* , const MKL_INT* , const double* ))
+
+_Mkl_Api(int,vslsSSEditTask,(VSLSSTaskPtr , const MKL_INT , const float* ))
+_mkl_api(int,vslsssedittask,(VSLSSTaskPtr* , const MKL_INT* , const float* ))
+_MKL_API(int,VSLSSSEDITTASK,(VSLSSTaskPtr* , const MKL_INT* , const float* ))
+
+_Mkl_Api(int,vsliSSEditTask,(VSLSSTaskPtr , const MKL_INT , const MKL_INT* ))
+_mkl_api(int,vslissedittask,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* ))
+_MKL_API(int,VSLISSEDITTASK,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* ))
+
+/*
+// Task specific editors
+*/
+
+/*
+// Editors to modify moments related parameters
+*/
+_Mkl_Api(int,vsldSSEditMoments,(VSLSSTaskPtr , double* , double* , double* , double* , double* , double* , double* ))
+_mkl_api(int,vsldsseditmoments,(VSLSSTaskPtr* , double* , double* , double* , double* , double* , double* ,
double* )) +_MKL_API(int,VSLDSSEDITMOMENTS,(VSLSSTaskPtr* , double* , double* , double* , double* , double* , double* , double* )) + +_Mkl_Api(int,vslsSSEditMoments,(VSLSSTaskPtr , float* , float* , float* , float* , float* , float* , float* )) +_mkl_api(int,vslssseditmoments,(VSLSSTaskPtr* , float* , float* , float* , float* , float* , float* , float* )) +_MKL_API(int,VSLSSSEDITMOMENTS,(VSLSSTaskPtr* , float* , float* , float* , float* , float* , float* , float* )) + + +/* +// Editors to modify sums related parameters +*/ +_Mkl_Api(int,vsldSSEditSums,(VSLSSTaskPtr , double* , double* , double* , double* , double* , double* , double* )) +_mkl_api(int,vsldsseditsums,(VSLSSTaskPtr* , double* , double* , double* , double* , double* , double* , double* )) +_MKL_API(int,VSLDSSEDITSUMS,(VSLSSTaskPtr* , double* , double* , double* , double* , double* , double* , double* )) + +_Mkl_Api(int,vslsSSEditSums,(VSLSSTaskPtr , float* , float* , float* , float* , float* , float* , float* )) +_mkl_api(int,vslssseditsums,(VSLSSTaskPtr* , float* , float* , float* , float* , float* , float* , float* )) +_MKL_API(int,VSLSSSEDITSUMS,(VSLSSTaskPtr* , float* , float* , float* , float* , float* , float* , float* )) + + +/* +// Editors to modify variance-covariance/correlation matrix related parameters +*/ +_Mkl_Api(int,vsldSSEditCovCor,(VSLSSTaskPtr , double* , double* , const MKL_INT* , double* , const MKL_INT* )) +_mkl_api(int,vsldsseditcovcor,(VSLSSTaskPtr* , double* , double* , const MKL_INT* , double* , const MKL_INT* )) +_MKL_API(int,VSLDSSEDITCOVCOR,(VSLSSTaskPtr* , double* , double* , const MKL_INT* , double* , const MKL_INT* )) + +_Mkl_Api(int,vslsSSEditCovCor,(VSLSSTaskPtr , float* , float* , const MKL_INT* , float* , const MKL_INT* )) +_mkl_api(int,vslssseditcovcor,(VSLSSTaskPtr* , float* , float* , const MKL_INT* , float* , const MKL_INT* )) +_MKL_API(int,VSLSSSEDITCOVCOR,(VSLSSTaskPtr* , float* , float* , const MKL_INT* , float* , const MKL_INT* )) + + +/* +// Editors to modify cross-product matrix related parameters +*/ +_Mkl_Api(int,vsldSSEditCP,(VSLSSTaskPtr , double* , double* , double* , const MKL_INT* )) +_mkl_api(int,vsldsseditcp,(VSLSSTaskPtr* , double* , double* , double* , const MKL_INT* )) +_MKL_API(int,VSLDSSEDITCP,(VSLSSTaskPtr* , double* , double* , double* , const MKL_INT* )) + +_Mkl_Api(int,vslsSSEditCP,(VSLSSTaskPtr , float* , float* , float* , const MKL_INT* )) +_mkl_api(int,vslssseditcp,(VSLSSTaskPtr* , float* , float* , float* , const MKL_INT* )) +_MKL_API(int,VSLSSSEDITCP,(VSLSSTaskPtr* , float* , float* , float* , const MKL_INT* )) + + +/* +// Editors to modify partial variance-covariance matrix related parameters +*/ +_Mkl_Api(int,vsldSSEditPartialCovCor,(VSLSSTaskPtr , const MKL_INT [], const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* , const MKL_INT* )) +_mkl_api(int,vsldsseditpartialcovcor,(VSLSSTaskPtr* , const MKL_INT [], const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* , const MKL_INT* )) +_MKL_API(int,VSLDSSEDITPARTIALCOVCOR,(VSLSSTaskPtr* , const MKL_INT [], const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* , const MKL_INT* )) + +_Mkl_Api(int,vslsSSEditPartialCovCor,(VSLSSTaskPtr , const MKL_INT [], const float* , const MKL_INT* , const float* , const MKL_INT* , float* , const MKL_INT* , float* , const MKL_INT* )) +_mkl_api(int,vslssseditpartialcovcor,(VSLSSTaskPtr* , const MKL_INT [], const float* , const MKL_INT* , 
const float* , const MKL_INT* , float* , const MKL_INT* , float* , const MKL_INT* )) +_MKL_API(int,VSLSSSEDITPARTIALCOVCOR,(VSLSSTaskPtr* , const MKL_INT [], const float* , const MKL_INT* , const float* , const MKL_INT* , float* , const MKL_INT* , float* , const MKL_INT* )) + + +/* +// Editors to modify quantiles related parameters +*/ +_Mkl_Api(int,vsldSSEditQuantiles,(VSLSSTaskPtr , const MKL_INT* , const double* , double* , double* , const MKL_INT* )) +_mkl_api(int,vsldsseditquantiles,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* , double* , const MKL_INT* )) +_MKL_API(int,VSLDSSEDITQUANTILES,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* , double* , const MKL_INT* )) + +_Mkl_Api(int,vslsSSEditQuantiles,(VSLSSTaskPtr , const MKL_INT* , const float* , float* , float* , const MKL_INT* )) +_mkl_api(int,vslssseditquantiles,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* , float* , const MKL_INT* )) +_MKL_API(int,VSLSSSEDITQUANTILES,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* , float* , const MKL_INT* )) + + +/* +// Editors to modify stream data quantiles related parameters +*/ +_Mkl_Api(int,vsldSSEditStreamQuantiles,(VSLSSTaskPtr , const MKL_INT* , const double* , double* , const MKL_INT* , const double* )) +_mkl_api(int,vsldsseditstreamquantiles,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* , const MKL_INT* , const double* )) +_MKL_API(int,VSLDSSEDITSTREAMQUANTILES,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* , const MKL_INT* , const double* )) + +_Mkl_Api(int,vslsSSEditStreamQuantiles,(VSLSSTaskPtr , const MKL_INT* , const float* , float* , const MKL_INT* , const float* )) +_mkl_api(int,vslssseditstreamquantiles,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* , const MKL_INT* , const float* )) +_MKL_API(int,VSLSSSEDITSTREAMQUANTILES,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* , const MKL_INT* , const float* )) + +/* +// Editors to modify pooled/group variance-covariance matrix related parameters +*/ +_Mkl_Api(int,vsldSSEditPooledCovariance,(VSLSSTaskPtr , const MKL_INT* , double* , double* , const MKL_INT* , double* , double* )) +_mkl_api(int,vsldsseditpooledcovariance,(VSLSSTaskPtr* , const MKL_INT* , double* , double* , const MKL_INT* , double* , double* )) +_MKL_API(int,VSLDSSEDITPOOLEDCOVARIANCE,(VSLSSTaskPtr* , const MKL_INT* , double* , double* , const MKL_INT* , double* , double* )) + +_Mkl_Api(int,vslsSSEditPooledCovariance,(VSLSSTaskPtr , const MKL_INT* , float* , float* , const MKL_INT* , float* , float* )) +_mkl_api(int,vslssseditpooledcovariance,(VSLSSTaskPtr* , const MKL_INT* , float* , float* , const MKL_INT* , float* , float* )) +_MKL_API(int,VSLSSSEDITPOOLEDCOVARIANCE,(VSLSSTaskPtr* , const MKL_INT* , float* , float* , const MKL_INT* , float* , float* )) + + +/* +// Editors to modify robust variance-covariance matrix related parameters +*/ +_Mkl_Api(int,vsldSSEditRobustCovariance,(VSLSSTaskPtr , const MKL_INT* , const MKL_INT* , const double* , double* , double* )) +_mkl_api(int,vsldsseditrobustcovariance,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const double* , double* , double* )) +_MKL_API(int,VSLDSSEDITROBUSTCOVARIANCE,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const double* , double* , double* )) + +_Mkl_Api(int,vslsSSEditRobustCovariance,(VSLSSTaskPtr , const MKL_INT* , const MKL_INT* , const float* , float* , float* )) +_mkl_api(int,vslssseditrobustcovariance,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const float* , float* , float* )) 
+_MKL_API(int,VSLSSSEDITROBUSTCOVARIANCE,(VSLSSTaskPtr* , const MKL_INT* , const MKL_INT* , const float* , float* , float* ))
+
+
+/*
+// Editors to modify outliers detection parameters
+*/
+_Mkl_Api(int,vsldSSEditOutliersDetection,(VSLSSTaskPtr , const MKL_INT* , const double* , double* ))
+_mkl_api(int,vsldsseditoutliersdetection,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* ))
+_MKL_API(int,VSLDSSEDITOUTLIERSDETECTION,(VSLSSTaskPtr* , const MKL_INT* , const double* , double* ))
+
+_Mkl_Api(int,vslsSSEditOutliersDetection,(VSLSSTaskPtr , const MKL_INT* , const float* , float* ))
+_mkl_api(int,vslssseditoutliersdetection,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* ))
+_MKL_API(int,VSLSSSEDITOUTLIERSDETECTION,(VSLSSTaskPtr* , const MKL_INT* , const float* , float* ))
+
+/*
+// Editors to modify missing values support parameters
+*/
+_Mkl_Api(int,vsldSSEditMissingValues,(VSLSSTaskPtr , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* ))
+_mkl_api(int,vsldsseditmissingvalues,(VSLSSTaskPtr* , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* ))
+_MKL_API(int,VSLDSSEDITMISSINGVALUES,(VSLSSTaskPtr* , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , const double* , const MKL_INT* , double* , const MKL_INT* , double* ))
+
+_Mkl_Api(int,vslsSSEditMissingValues,(VSLSSTaskPtr , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , float* , const MKL_INT* , float* ))
+_mkl_api(int,vslssseditmissingvalues,(VSLSSTaskPtr* , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , float* , const MKL_INT* , float* ))
+_MKL_API(int,VSLSSSEDITMISSINGVALUES,(VSLSSTaskPtr* , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , const float* , const MKL_INT* , float* , const MKL_INT* , float* ))
+
+/*
+// Editors to modify matrix parameterization parameters
+*/
+_Mkl_Api(int,vsldSSEditCorParameterization,(VSLSSTaskPtr , const double* , const MKL_INT* , double* , const MKL_INT* ))
+_mkl_api(int,vsldsseditcorparameterization,(VSLSSTaskPtr* , const double* , const MKL_INT* , double* , const MKL_INT* ))
+_MKL_API(int,VSLDSSEDITCORPARAMETERIZATION,(VSLSSTaskPtr* , const double* , const MKL_INT* , double* , const MKL_INT* ))
+
+_Mkl_Api(int,vslsSSEditCorParameterization,(VSLSSTaskPtr , const float* , const MKL_INT* , float* , const MKL_INT* ))
+_mkl_api(int,vslssseditcorparameterization,(VSLSSTaskPtr* , const float* , const MKL_INT* , float* , const MKL_INT* ))
+_MKL_API(int,VSLSSSEDITCORPARAMETERIZATION,(VSLSSTaskPtr* , const float* , const MKL_INT* , float* , const MKL_INT* ))
+
+
+/*
+// Compute routines
+*/
+_Mkl_Api(int,vsldSSCompute,(VSLSSTaskPtr , const unsigned MKL_INT64 , const MKL_INT ))
+_mkl_api(int,vsldsscompute,(VSLSSTaskPtr* , const unsigned MKL_INT64* , const MKL_INT* ))
+_MKL_API(int,VSLDSSCOMPUTE,(VSLSSTaskPtr* , const unsigned MKL_INT64* , const MKL_INT* ))
+
+_Mkl_Api(int,vslsSSCompute,(VSLSSTaskPtr , const unsigned MKL_INT64 , const MKL_INT ))
+_mkl_api(int,vslssscompute,(VSLSSTaskPtr* , const unsigned MKL_INT64* , const MKL_INT* ))
+_MKL_API(int,VSLSSSCOMPUTE,(VSLSSTaskPtr* , const unsigned MKL_INT64* , const MKL_INT* ))
+
+
+/*
+// Task destructor
+*/
+_Mkl_Api(int,vslSSDeleteTask,(VSLSSTaskPtr* ))
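The editors above, together with the compute entry points and the task destructor, make up the whole summary-statistics lifecycle (the lowercase `_mkl_api` and uppercase `_MKL_API` rows declare the same routines with pointer arguments for Fortran-style linkage). A minimal sketch of that flow, assuming the task-creation and generic-editor routines (`vsldSSNewTask`, `vsldSSEditTask`) declared elsewhere in this header:

```cpp
// Hedged sketch: mean of n observations of a dim-dimensional variable.
// vsldSSNewTask/vsldSSEditTask are VSL routines declared outside this excerpt.
#include "mkl_vsl.h"

int compute_mean(const double *x, MKL_INT dim, MKL_INT n, double *mean) {
    VSLSSTaskPtr task;
    MKL_INT xstorage = VSL_SS_MATRIX_STORAGE_ROWS;
    int status = vsldSSNewTask(&task, &dim, &n, &xstorage, x, 0, 0);
    if (status != VSL_STATUS_OK) return status;
    vsldSSEditTask(task, VSL_SS_ED_MEAN, mean);     // register result buffer
    status = vsldSSCompute(task, VSL_SS_MEAN, VSL_SS_METHOD_FAST);
    vslSSDeleteTask(&task);                         // always release the task
    return status;
}
```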
+_mkl_api(int,vslssdeletetask,(VSLSSTaskPtr* )) +_MKL_API(int,VSLSSDELETETASK,(VSLSSTaskPtr* )) + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __MKL_VSL_FUNCTIONS_H__ */ diff --git a/python/ideep4py/include/mkl/mkl_vsl_types.h b/python/ideep4py/include/mkl/mkl_vsl_types.h new file mode 100644 index 00000000..48c4e38e --- /dev/null +++ b/python/ideep4py/include/mkl/mkl_vsl_types.h @@ -0,0 +1,126 @@ +/* file: mkl_vsl_types.h */ +/******************************************************************************* +* Copyright (c) 2006-2017, Intel Corporation +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* * Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of Intel Corporation nor the names of its contributors +* may be used to endorse or promote products derived from this software +* without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* +//++ +// This file contains user-level type definitions. +//-- +*/ + +#ifndef __MKL_VSL_TYPES_H__ +#define __MKL_VSL_TYPES_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include "mkl_types.h" + +/* +//++ +// TYPEDEFS +//-- +*/ + +/* +// POINTER TO STREAM STATE STRUCTURE +// This is a void pointer to hide implementation details. 
+*/ +typedef void* VSLStreamStatePtr; +typedef void* VSLConvTaskPtr; +typedef void* VSLCorrTaskPtr; +typedef void* VSLSSTaskPtr; + +/* +// POINTERS TO BASIC RANDOM NUMBER GENERATOR FUNCTIONS +// Each BRNG must have following implementations: +// +// * Stream initialization (InitStreamPtr) +// * Integer-value recurrence implementation (iBRngPtr) +// * Single precision implementation (sBRngPtr) - for random number generation +// uniformly distributed on the [a,b] interval +// * Double precision implementation (dBRngPtr) - for random number generation +// uniformly distributed on the [a,b] interval +*/ +typedef int (*InitStreamPtr)( int method, VSLStreamStatePtr stream, \ + int n, const unsigned int params[] ); +typedef int (*sBRngPtr)( VSLStreamStatePtr stream, int n, float r[], \ + float a, float b ); +typedef int (*dBRngPtr)( VSLStreamStatePtr stream, int n, double r[], \ + double a, double b ); +typedef int (*iBRngPtr)( VSLStreamStatePtr stream, int n, unsigned int r[] ); + +/*********** Pointers to callback functions for abstract streams *************/ +typedef int (*iUpdateFuncPtr)( VSLStreamStatePtr stream, int* n, \ + unsigned int ibuf[], int* nmin, int* nmax, int* idx ); +typedef int (*dUpdateFuncPtr)( VSLStreamStatePtr stream, int* n, + double dbuf[], int* nmin, int* nmax, int* idx ); +typedef int (*sUpdateFuncPtr)( VSLStreamStatePtr stream, int* n, \ + float sbuf[], int* nmin, int* nmax, int* idx ); + + +/* +// BASIC RANDOM NUMBER GENERATOR PROPERTIES STRUCTURE +// The structure describes the properties of given basic generator, e.g. size +// of the stream state structure, pointers to function implementations, etc. +// +// BRNG properties structure fields: +// StreamStateSize - size of the stream state structure (in bytes) +// WordSize - size of base word (in bytes). Typically this is 4 bytes. +// NSeeds - number of words necessary to describe generator's state +// NBits - number of bits actually used in base word. For example, +// only 31 least significant bits are actually used in +// basic random number generator MCG31m1 with 4-byte base +// word. NBits field is useful while interpreting random +// words as a sequence of random bits. +// IncludesZero - FALSE if 0 cannot be generated in integer-valued +// implementation; TRUE if 0 can be potentially generated in +// integer-valued implementation. +// InitStream - pointer to stream state initialization function +// sBRng - pointer to single precision implementation +// dBRng - pointer to double precision implementation +// iBRng - pointer to integer-value implementation +*/ +typedef struct _VSLBRngProperties { + int StreamStateSize; /* Stream state size (in bytes) */ + int NSeeds; /* Number of seeds */ + int IncludesZero; /* Zero flag */ + int WordSize; /* Size (in bytes) of base word */ + int NBits; /* Number of actually used bits */ + InitStreamPtr InitStream; /* Pointer to InitStream func */ + sBRngPtr sBRng; /* Pointer to S func */ + dBRngPtr dBRng; /* Pointer to D func */ + iBRngPtr iBRng; /* Pointer to I func */ +} VSLBRngProperties; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __MKL_VSL_TYPES_H__ */ diff --git a/python/ideep4py/include/mm/mem.h b/python/ideep4py/include/mm/mem.h new file mode 100644 index 00000000..88784bc5 --- /dev/null +++ b/python/ideep4py/include/mm/mem.h @@ -0,0 +1,193 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
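To make the typedefs and the properties structure above concrete, here is a hedged sketch of the glue a user-defined basic generator would supply; the recurrence is a placeholder, the field values are illustrative, and registration itself goes through `vslRegisterBrng`, which is declared elsewhere in the VSL API:

```cpp
// Hedged sketch: callbacks matching InitStreamPtr/iBRngPtr, wired into a
// VSLBRngProperties record in the field order declared above.
#include "mkl_vsl.h"

static int my_init(int method, VSLStreamStatePtr stream, int n,
                   const unsigned int params[]) { return 0; }

static int my_ibrng(VSLStreamStatePtr stream, int n, unsigned int r[]) {
    for (int i = 0; i < n; i++) r[i] = 12345u;  // placeholder recurrence
    return 0;
}

static VSLBRngProperties props = {
    /*StreamStateSize=*/64, /*NSeeds=*/1, /*IncludesZero=*/1,
    /*WordSize=*/4, /*NBits=*/32,
    /*InitStream=*/my_init, /*sBRng=*/nullptr, /*dBRng=*/nullptr,
    /*iBRng=*/my_ibrng,
};
```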
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#pragma once
+
+// NOTE: the original header names were lost in extraction; this set is
+// inferred from the standard-library facilities used below.
+#include <cstdlib>
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "utils.h"
+
+using namespace std;
+static constexpr int DEFAULT_ALIGNMENT = 64;
+
+typedef enum {
+    MPOOL_ANON,
+    MPOOL_REORDER,
+    MPOOL_ELTWISE_FWD,
+    MPOOL_ELTWISE_BWD,
+    MPOOL_BN_FWD,
+    MPOOL_BN_BWD,
+    MPOOL_LRN_FWD,
+    MPOOL_LRN_BWD,
+    MPOOL_CONV_FWD,
+    MPOOL_CONV_BWD,
+    MPOOL_POOLING_FWD,
+    MPOOL_POOLING_BWD,
+    MPOOL_IP_FWD,
+    MPOOL_IP_BWD,
+    MPOOL_CONCAT_FWD,
+    MPOOL_CONCAT_BWD,
+} mem_pool_t;
+
+// Size-bucketed, mutex-guarded free-list allocator. The template parameter
+// (reconstructed; it was stripped in extraction) is the byte alignment of
+// the pointers handed back to callers.
+template <std::size_t ALIGNMENT = DEFAULT_ALIGNMENT>
+class Memory {
+public:
+    Memory() : alloc_size_(0), free_size_(0), seq_(0) {}
+    Memory(const char *name) : alloc_size_(0), free_size_(0)
+        , seq_(0), name_(name) {}
+    virtual ~Memory() {
+        //std::cout << name_ << " alloc size " << alloc_size_ << " free size "
+        //    << free_size_ << std::endl;
+    }
+
+    void* malloc(size_t size) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        void *ptr;
+        int idx = to_index(size);
+
+        // First try to recycle a cached block of exactly this size.
+        if (!free_hashline_[idx].empty()) {
+            block_t *block = nullptr;
+            std::list<block_t *> &list = free_hashline_[idx];
+            typename std::list<block_t *>::iterator it;
+            for (it = list.begin(); it != list.end(); ++it) {
+                if ((*it)->header_.size_ == size) {
+                    block = *it;
+                    break;
+                }
+            }
+            if (block) {
+                list.erase(it);
+                void *ptr = static_cast<void *>(block);
+                free_size_ -= size;
+                //std::cout << name_ << " cache alloc seq " << block->header_.seq_ << " size " << block->header_.size_ << std::endl;
+                return GET_PTR(void, ptr, ALIGNMENT);
+            }
+        }
+        // No cached memory: over-allocate by ALIGNMENT so the block header
+        // fits in front of the aligned pointer returned to the caller.
+        size_t len = size + ALIGNMENT;
+        int rc = ::posix_memalign(&ptr, ALIGNMENT, len);
+        if (rc != 0) {
+            throw std::invalid_argument("Out of memory");
+        }
+        block_t *block = static_cast<block_t *>(ptr);
+        block->header_.size_ = size;
+        alloc_size_ += size;
+        //std::cout << name_ << " system alloc seq " << seq_ << " size " << size << std::endl;
+        block->header_.seq_ = seq_++;
+        return GET_PTR(void, ptr, ALIGNMENT);
+    }
+
+    void free(void* ptr) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        // Step back over the header that precedes the user pointer.
+        block_t *block = GET_PTR(block_t, ptr, -ALIGNMENT);
+        int idx = to_index(block->header_.size_);
+        free_hashline_[idx].push_back(block);
+        free_size_ += block->header_.size_;
+        //std::cout << name_ << " free seq " << block->header_.seq_ << " size " << block->header_.size_ << std::endl;
+    }
+
+    void epoch() {
+    }
+private:
+    int to_index(std::size_t size) {
+        std::string str = long_to_string(size);
+        std::size_t hash = std::hash<std::string>{}(str);
+        int idx = hash % MAX_ENTRY;
+        return idx;
+    }
+
+    typedef union _header_str {
+        struct {
+            std::size_t size_;
+            int seq_;
+        };
+        char pad_[ALIGNMENT];
+    } header_t;
+
+    typedef struct _block_str {
+        header_t header_;
+        char data_[];
+    } block_t;
+
+    static constexpr int MAX_ENTRY = 512;
+
+    std::size_t alloc_size_;
+    std::size_t free_size_;
+    std::list<block_t *> free_hashline_[MAX_ENTRY];
+    std::mutex mutex_;
+    int seq_;
+    std::string name_;
+};
+
+void* dnn_malloc(size_t size, mem_pool_t pool=MPOOL_ANON);
+void dnn_free(void *p, mem_pool_t pool=MPOOL_ANON);
+
+// Just grab it from MKL-DNN
+namespace avx {
+#if 1
+    inline void* malloc(size_t size, int alignment) {
+        return ::dnn_malloc(size);
+    }
+    inline void free(void* p) { ::dnn_free(p); }
+#else
+    inline void* malloc(size_t size, int alignment) {
+        void *ptr;
+        int rc = ::posix_memalign(&ptr, alignment, size);
+        return (rc == 0) ? ptr : 0;
+    }
+    inline void free(void* p) { ::free(p); }
+#endif
+
+    struct compatible {
+        enum { default_alignment = DEFAULT_ALIGNMENT };
+        static void* operator new(size_t sz) {
+            return malloc(sz, default_alignment);
+        }
+        static void* operator new(size_t sz, void* p) { (void)sz; return p; }
+        static void* operator new[](size_t sz) {
+            return malloc(sz, default_alignment);
+        }
+        static void operator delete(void* p) {
+            free(p); }
+        static void operator delete[](void* p) {
+            free(p); }
+    };
+
+    struct byte: public compatible {
+        char q;
+    };
+}
+
+// Template arguments below were stripped in extraction and are reconstructed
+// from usage elsewhere in this tree (buffers are avx::byte, dims are int).
+class Allocator {
+  public:
+    static std::shared_ptr<avx::byte> malloc(size_t len, mem_pool_t mpool);
+    static std::shared_ptr<avx::byte> malloc(vector<int> dims, int element_sz, mem_pool_t mpool);
+};
diff --git a/python/ideep4py/include/mm/mkldnn_ex.h b/python/ideep4py/include/mm/mkldnn_ex.h
new file mode 100644
index 00000000..d9380a9d
--- /dev/null
+++ b/python/ideep4py/include/mm/mkldnn_ex.h
@@ -0,0 +1,92 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
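The pool above hands out cached, aligned blocks bucketed by request size, and `dnn_malloc`/`dnn_free` are the plain-function entry points over it. A minimal usage sketch, assuming the header is included as `mem.h`:

```cpp
// Hedged sketch: borrow a pooled scratch buffer for a convolution forward
// pass and hand it back so the block is recycled on the free list.
#include "mem.h"

void scratch_example() {
    void *buf = dnn_malloc(1024 * sizeof(float), MPOOL_CONV_FWD);
    // ... use buf as a 1024-float workspace ...
    dnn_free(buf, MPOOL_CONV_FWD);
}
```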
+ * + */ + + +#pragma once +#include "mkldnn.hpp" +#include "reorder.h" + +inline static mkldnn::memory reorder_if_must(mkldnn::memory user + , mkldnn::memory::primitive_desc expect + , std::unique_ptr &mreorder + , std::vector *dag) { + + if (user.get_primitive_desc() != expect) { + mkldnn::memory interm(expect); +#if 0 + auto user_mpd = user.get_primitive_desc(); + mkldnn::memory::format user_fmt = static_cast( + user_mpd.desc().data.format); + mkldnn::memory::format mkl_fmt = static_cast( + expect.desc().data.format); + mkldnn::memory::data_type dtype = static_cast( + expect.desc().data.data_type); + + if ((user_fmt == mkldnn::memory::format::nChw16c && + mkl_fmt == mkldnn::memory::format::nChw8c) || + (mkl_fmt == mkldnn::memory::format::nChw16c && + user_fmt == mkldnn::memory::format::nChw8c)) { + auto m = expect.desc().data; + int n = m.dims[0], c = m.dims[1], h = m.dims[2], w = m.dims[3]; + mkldnn::memory::dims tz = {n, c, h, w}; + mreorder.reset(new mkldnn::memory({{{ tz }, dtype, mkldnn::memory::format::nchw }, expect.get_engine()})); + //auto mreorder = new mkldnn::memory({{{ tz }, dtype, mkldnn::memory::format::nchw }, expect.get_engine()}); + auto rep1 = mkldnn::reorder(user, *mreorder); + auto rep2 = mkldnn::reorder(*mreorder, interm); + dag->push_back(rep1); + dag->push_back(rep2); + //static int spl_nr = 0; + //printf("\n %d *Reorder(split) iutput from:%d, to:%d\n", spl_nr++, user_fmt, mkl_fmt); + } else { + dag->push_back(mkldnn::reorder(user, interm)); + } +#else + dag->push_back(mkldnn::reorder(user, interm)); +#endif + return interm; + } + + return user; +} + +template +inline static void axpby(Tensor *dst, T a, Tensor *x, T b, Tensor *y) { + std::vector prims; + std::unique_ptr mreorder; + + /// Reorder to x's format + auto mid = reorder_if_must(y->mkldnn_memory(), x->mkldnn_memory().get_primitive_desc() + , mreorder, &prims); + + mkldnn::sum::primitive_desc sum_pd(std::vector({(float)a, (float)b}) + , {x->mkldnn_memory().get_primitive_desc(), mid.get_primitive_desc()}); + + std::vector inputs_at {x->mkldnn_memory(), mid}; + + mkldnn::sum sum_prim(sum_pd, inputs_at, dst->mkldnn_memory()); + prims.push_back(sum_prim); + + mkldnn::stream s(mkldnn::stream::kind::eager); + s.submit(prims).wait(); +} + diff --git a/python/ideep4py/include/mm/reorder.h b/python/ideep4py/include/mm/reorder.h new file mode 100644 index 00000000..76d58a0f --- /dev/null +++ b/python/ideep4py/include/mm/reorder.h @@ -0,0 +1,202 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once + +#include "mkldnn.hpp" +#include "tensor.h" + +extern engine cpu_engine; +static constexpr int MAX_NDIM = 12; //XXX: For now + +class Reorderer { +//protected: +public: + bool non_trivial_; + mkldnn::memory dst_; + std::shared_ptr data_; + + int ndims_; + int size_; + char format_[4]; + Py_ssize_t itemsize_; + Py_ssize_t strides_[MAX_NDIM]; + Py_ssize_t shape_[MAX_NDIM]; + + void _collect_buffer_info() { + auto md = dst_.get_primitive_desc().desc(); + int ndims = md.data.ndims; + + ndims_ = ndims; + switch(static_cast(md.data.data_type)) { + case mkldnn::memory::f32: + strcpy(format_, "f"); + itemsize_ = 4; + break; + case mkldnn::memory::s32: + strcpy(format_, "i"); + itemsize_ = 4; + break; + case mkldnn::memory::s16: + strcpy(format_, "h"); + itemsize_ = 2; + break; + case mkldnn::memory::s8: + strcpy(format_, "b"); + itemsize_ = 1; + break; + case mkldnn::memory::u8: + strcpy(format_, "B"); + itemsize_ = 1; + break; + default: + break; + } + + for (int i = 0; i < ndims; i ++) { + shape_[i] = md.data.dims[i]; + } + + Py_ssize_t sd = itemsize_; + + for (int i = ndims -1; i >= 0; --i) { + strides_[i] = sd; + sd *= shape_[i]; + } + } + + inline avx::byte *data() const { return data_.get(); } + +public: +#if 0 + Reorderer(const py_handle in) + :Reorderer(in.get()) {} +#endif + Reorderer(const Tensor *src) + : non_trivial_(src->incompatible()), dst_([src] () { + if (src->incompatible()) { + auto md_data = src->desc().data; + + mkldnn::memory::dims adims(md_data.dims + , md_data.dims + md_data.ndims); + + mkldnn::memory::primitive_desc pd ({adims + , static_cast(md_data.data_type) + , static_cast(::public_format(md_data.format))} + , src->get_engine()); + + // XXX: magic number 4 is a hack + return mkldnn::memory(pd, reinterpret_cast(4)); + } else { + return src->mkldnn_memory(); + }} ()), size_(src->size()) { + if (src->incompatible()) { + auto pd = dst_.get_primitive_desc(); + + data_ = std::shared_ptr(new avx::byte [pd.get_size()] + , [](avx::byte *p) {delete [] p;}); + + dst_.set_data_handle(data_.get()); + + } else { + data_ = src->share_data(); + } + + _collect_buffer_info(); + } + + mkldnn::reorder fire(const Tensor *src) { + mkldnn::reorder reorder(src->mkldnn_memory(), dst_); + mkldnn::stream s(mkldnn::stream::eager); + + s.submit({reorder}).wait(); + return reorder; + } + + mkldnn::reorder sync(const Tensor *src) { + mkldnn::reorder reorder(dst_, src->mkldnn_memory()); + mkldnn::stream s(mkldnn::stream::eager); + + s.submit({reorder}).wait(); + return reorder; + } + + inline bool non_trivial() const { + return non_trivial_; + } + +#if 0 + // PEP 3118 interface + int build_view(Py_buffer *view, int flags) { + view->buf = data_.get(); + view->itemsize = itemsize_; + view->readonly = 0; + view->internal = nullptr; + view->len = size_ * itemsize_; + + if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) { + view->format = format_; + } else { + view->format = nullptr; + } + + if ((flags & PyBUF_ND) == PyBUF_ND) { + view->ndim = ndims_; + view->shape = shape_; + } else { + view->ndim = 0; + view->shape = nullptr; + } + + if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) { + view->strides = strides_; + } else { + view->strides = nullptr; + } + + view->suboffsets = nullptr; + + return 0; + } + + // Array 
protocol + PyArrayInterface *build_array_struct(void) { + auto arrstr = new PyArrayInterface(); + + arrstr->two = 2; + arrstr->nd = ndims_; + arrstr->typekind = *((char *)format_); + arrstr->itemsize = itemsize_; + arrstr->flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_NOTSWAPPED | + NPY_ARRAY_ALIGNED | NPY_ARRAY_WRITEABLE; + arrstr->flags &= ~(NPY_ARRAY_UPDATEIFCOPY | NPY_ARRAY_OWNDATA); + arrstr->shape = shape_; + arrstr->strides = strides_; + arrstr->data = data_.get(); + arrstr->descr = nullptr; + + return arrstr; + } +#endif +}; diff --git a/python/ideep4py/include/mm/tensor.h b/python/ideep4py/include/mm/tensor.h new file mode 100644 index 00000000..6e4a9423 --- /dev/null +++ b/python/ideep4py/include/mm/tensor.h @@ -0,0 +1,513 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once + +#include +#include +#include "mkldnn.hpp" +#include "mem.h" +#include "utils.h" +using namespace std; +using namespace mkldnn; +extern engine cpu_engine; + +typedef size_t size_type; +enum data_type_t { + UNKNOWN_TYPE = 0, + FLOAT32, + SINT32, + SINT16, + SINT8, + UINT8, +}; + +inline int type2size(data_type_t type) { + int size = 0; + switch (type) { + case FLOAT32: + size = 4; + break; + case SINT32: + size = 4; + break; + case SINT16: + size = 2; + break; + case SINT8: + size = 1; + break; + case UINT8: + size = 1; + break; + default: + break; + } + return size; +} + +inline size_t prod(vectordims, int ndims) +{ + size_t prod = 1; + for (int i = (ndims - 1); i >= 0; i--) { + prod *= dims[i]; + } + return prod; +} + +//input_type:'d'-->data, 'w'-->weight +inline mkldnn_memory_format_t ndims2format(int ndims, char input_type = 'd') +{ + mkldnn_memory_format_t fmt = mkldnn_any; + switch (ndims) { + case 1: + fmt = mkldnn_x; + break; + case 2: + fmt = (input_type == 'd') ? mkldnn_nc : mkldnn_oi; + break; + case 4: + fmt = (input_type == 'd') ? mkldnn_nchw : mkldnn_oihw; + break; + default: + throw mkldnn::error(mkldnn_invalid_arguments + , "MKLDNN does not support dimensions" + + ndims); + } + + return fmt; +} + + +inline mkldnn_memory_format_t ndims2format_preferred(int ndims, vector dims, char input_type = 'd') +{ + mkldnn_memory_format_t fmt = mkldnn_any; + switch (ndims) { + case 1: + fmt = mkldnn_x; + break; + case 2: + fmt = (input_type == 'd') ? 
mkldnn_nc : mkldnn_oi; + break; + case 4: + if (input_type == 'd') { + fmt = (mkldnn_memory_format_t)get_desired_format(dims[1]); + } else if (input_type == 'w') { + fmt = (mkldnn_memory_format_t)get_desired_format_weight(dims[0], dims[1]); + } + break; + default: + throw mkldnn::error(mkldnn_invalid_arguments + , "MKLDNN does not support dimensions" + + ndims); + } + + return fmt; +} + + + +inline mkldnn_memory_format_t public_format(mkldnn_memory_format_t origin) +{ + mkldnn_memory_format_t ret; + // review this relations carefully + switch(origin) { + case mkldnn_nchw: + case mkldnn_nhwc: + case mkldnn_chwn: + case mkldnn_nChw8c: + case mkldnn_nChw16c: + ret = mkldnn_nchw; + break; + case mkldnn_oihw: + case mkldnn_ihwo: + case mkldnn_hwio: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + case mkldnn_Ohwi8o: + case mkldnn_Ohwi16o: + case mkldnn_OhIw16o4i: + ret = mkldnn_oihw; + break; + default: + ret = origin; + break; + } + + return ret; +} + +inline mkldnn_memory_format_t format_2_as_4(mkldnn_memory_format_t origin) +{ + mkldnn_memory_format_t ret; + // review this relations carefully + switch(origin) { + case mkldnn_nc: + ret = mkldnn_nchw; + break; + case mkldnn_oi: + ret = mkldnn_oihw; + break; + default: + ret = origin; + break; + } + return ret; +} + +class Tensor { +public: + // Allocate memory in constructor + Tensor() : ndims_(0), type_(UNKNOWN_TYPE), size_(0), data_(nullptr) {} + virtual ~Tensor() = default; + + Tensor(int ndims, vector dims, data_type_t type) + : ndims_(ndims), dims_(dims), type_(type) { + size_ = std::accumulate(dims.begin(), dims.begin() + ndims, 1 + , std::multiplies()); + data_ = std::shared_ptr(new avx::byte [len()] + , [] (avx::byte *p) {delete [] p;}); + mm_fmt_ = ndims2format(ndims); + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + // input_type: 'd': data, 'w': weight + Tensor(int ndims, vector dims, void *data, data_type_t type, char input_type='d') + : ndims_(ndims), dims_(dims), type_(type) { + size_ = std::accumulate(dims.begin(), dims.begin() + ndims, 1 + , std::multiplies()); + data_ = std::shared_ptr(new avx::byte [len()] + , [] (avx::byte *p) {delete [] p;}); + //memcpy(data_.get(), data, len()); + memory::data_type dt = to_mkldnn_type(); + if (dt == memory::data_type::f32 && len() > 0) { //currently, mkldnn only support most f32 currently, may add int8 in future? 
+ auto mm_fmt_i = ndims2format(ndims, input_type); + mm_fmt_ = ndims2format_preferred(ndims, dims, input_type); + auto mem_i = mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_i) } + , cpu_engine }, data); + + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + auto reorder_prim = reorder(mem_i, *mem_); + std::vector prims = {reorder_prim}; + mkldnn::stream s(mkldnn::stream::kind::eager); + s.submit(prims).wait(); + } else { + mm_fmt_ = ndims2format(ndims, input_type); + fast_memcpy((char*)data_.get(), (char*)data, len()); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + } + + Tensor(int ndims, vector dims, std::shared_ptr data, data_type_t type) + : ndims_(ndims), dims_(dims), type_(type) { + size_ = std::accumulate(dims.begin(), dims.begin() + ndims, 1 + , std::multiplies()); + data_ = data; + mm_fmt_ = ndims2format(ndims); + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + Tensor(int ndims, vector dims, std::shared_ptr data + , mkldnn_memory_format_t mm_fmt, data_type_t type) + : ndims_(ndims), dims_(dims), type_(type) { + size_ = std::accumulate(dims.begin(), dims.begin() + ndims, 1 + , std::multiplies()); + data_ = data; + mm_fmt_ = mm_fmt; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + Tensor(int ndims, vector dims, + mkldnn_memory_format_t mm_fmt, data_type_t type) + : Tensor(ndims, dims, type) { + mm_fmt_ = mm_fmt; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + +#if 0 + Tensor(int ndims, vector dims, void *data, + mkldnn_memory_format_t mm_fmt, data_type_t type=FLOAT32) + : Tensor(ndims, dims, data, type) { + mm_fmt_ = mm_fmt; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } +#endif + + Tensor(mkldnn::memory::dims dims + , mkldnn_data_type_t dt + , mkldnn::memory::format format + , shared_ptr data) + : ndims_(dims.size()), dims_(dims) { + type_ = to_tensor_type(dt); + size_ = std::accumulate(dims.begin(), dims.end(), 1 + , std::multiplies()); + data_ = data; + mm_fmt_ = mkldnn_memory_format_t(format); + mem_.reset(new mkldnn::memory( + { { { dims_ }, static_cast(dt) + , static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + + } + + Tensor(mkldnn::memory::dims dims + , mkldnn::memory::data_type dt + , mkldnn::memory::format format + , const mkldnn::engine &engine) + : Tensor({{std::move(dims), dt, format}, engine}) {} + + Tensor(mkldnn::memory::primitive_desc pd) { + auto md = pd.desc().data; + ndims_ = md.ndims; + dims_.assign(md.dims, md.dims + md.ndims); + type_ = to_tensor_type(md.data_type); + size_ = std::accumulate(md.dims, md.dims + md.ndims, 1 + , std::multiplies()); + data_ = std::shared_ptr(new avx::byte [len()] + , [] (avx::byte *p) {delete [] p;}); + mm_fmt_ = md.format; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt , static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + inline void reset_memory(mkldnn_memory_format_t mkldnn_mfmt, avx::byte *data) { + mm_fmt_ = mkldnn_mfmt; + data_.reset(data); + memory::data_type dt = to_mkldnn_type(); + 
mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + inline void reset_memory(mkldnn_memory_format_t mkldnn_mfmt, shared_ptr data) { + mm_fmt_ = mkldnn_mfmt; + data_ = data; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + inline void reset_memory(mkldnn_memory_format_t mkldnn_mfmt, vector dims) { + mm_fmt_ = mkldnn_mfmt; + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + } + + inline size_t len() { + return size_ * type2size(type_); + } + + inline bool incompatible() const { + return (public_format(mm_fmt_) != mm_fmt_); + } + + inline memory::data_type to_mkldnn_type() const { + memory::data_type type; + switch (type_) { + case FLOAT32: + type = memory::data_type::f32; + break; + case SINT32: + type = memory::data_type::s32; + break; + case SINT16: + type = memory::data_type::s16; + break; + case SINT8: + type = memory::data_type::s8; + break; + case UINT8: + type = memory::data_type::u8; + break; + default: + type = memory::data_undef; + break; + } + return type; + } + + inline data_type_t to_tensor_type(mkldnn_data_type_t type) const { + data_type_t dt; + switch (type) { + case mkldnn_f32: + dt = FLOAT32; + break; + case mkldnn_s32: + dt = SINT32; + break; + case mkldnn_s16: + dt = SINT16; + break; + case mkldnn_s8: + dt = SINT8; + break; + case mkldnn_u8: + dt = UINT8; + break; + default: + dt = UNKNOWN_TYPE; + break; + } + return dt; + } + + inline void *data() const { return data_.get(); } + inline std::shared_ptr share_data() const { + return data_; + } + + inline size_type size() const { return size_; } + inline mkldnn::engine get_engine() const { + return cpu_engine; + } + + inline int ndims() const { + return ndims_; + } + + inline vector dims() const { + return dims_; + } + + inline data_type_t type() const { + return type_; + } + + inline mkldnn::memory mkldnn_memory() const { + return *(to_mkldnn_memory()); + } + + inline memory::desc desc() const { + return to_mkldnn_memory()->get_primitive_desc().desc(); + } + + inline mkldnn_memory_format_t format() const { + return mm_fmt_; + } + + inline mkldnn::memory::format cxx_format() const { + return static_cast(mm_fmt_); + } + + inline mkldnn::memory::dims cxx_dims() const { + mkldnn::memory::dims ret(dims_.begin(), dims_.begin() + ndims_); + return ret; + } + + inline mkldnn::memory::data_type cxx_data_type() const { + return static_cast(to_mkldnn_type()); + } + + inline Tensor *reshape(vector dims) { + int ndims = dims.size(); + // Reorder to public format + mkldnn_memory_format_t public_fmt = public_format(mm_fmt_); + if (public_fmt != mm_fmt_) { + //printf("reorder----\n"); + memory::data_type dt = to_mkldnn_type(); + auto data = new avx::byte [len()]; + auto mem = mkldnn::memory( + { { { dims_ }, dt, static_cast(public_fmt) } + , cpu_engine }, data); + + auto reorder_prim = reorder(*mem_, mem); + std::vector prims = { reorder_prim }; + mkldnn::stream s(mkldnn::stream::kind::eager); + s.submit(prims).wait(); + + reset_memory(public_fmt, data); + } + + return new Tensor(ndims, dims, data_, type_); + } + + inline bool copyto(Tensor *src) { + if ((src->type() != type()) || (src->dims() != dims())) { + return false; + } + mm_fmt_ = src->format(); + fast_memcpy((char*)data_.get(), (char*)src->data(), len()); + memory::data_type dt = to_mkldnn_type(); + 
mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + return true; + } + + inline void copyto(char *src) { + mm_fmt_ = public_format(mm_fmt_); + fast_memcpy((char*)data_.get(), src, len()); + memory::data_type dt = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, dt, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); + return; + } + + Tensor * sum(vector axis); + +protected: + int ndims_; + vector dims_; + data_type_t type_; + size_t size_; + std::shared_ptr data_; + + mkldnn_memory_format_t mm_fmt_; + std::shared_ptr mem_; +private: + inline shared_ptr to_mkldnn_memory() const { + return mem_; + } +}; diff --git a/python/ideep4py/include/primitives/bn.h b/python/ideep4py/include/primitives/bn.h new file mode 100644 index 00000000..e9c3da82 --- /dev/null +++ b/python/ideep4py/include/primitives/bn.h @@ -0,0 +1,55 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _BN_H_ +#define _BN_H_ + +#include +#include +#include +#include "layer.h" +#include "tensor.h" + +template +class batch_normalization : public Layer +{ +public: + batch_normalization() {}; + ~batch_normalization() {}; + + static std::vector Forward(Tensor *src, + Tensor *w, + Tensor *mean, + Tensor *var, + float eps); + + static std::vector Backward(Tensor *src, + Tensor *diff_dst, + Tensor *mean, + Tensor *var, + Tensor *w, + float eps); +}; + +#endif diff --git a/python/ideep4py/include/primitives/concat.h b/python/ideep4py/include/primitives/concat.h new file mode 100644 index 00000000..ea449ca2 --- /dev/null +++ b/python/ideep4py/include/primitives/concat.h @@ -0,0 +1,62 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
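The `Tensor` constructors above take a raw buffer plus a layout hint and, for f32 data, reorder into MKL-DNN's preferred blocked format up front. A hedged sketch (the element type of `dims` is assumed to be `int`, matching `cxx_dims()`):

```cpp
// Hedged sketch: wrap an existing NCHW float buffer. input_type 'd' marks
// it as data (as opposed to 'w' for weights), selecting the nchw source
// format before the reorder into the preferred layout.
#include "tensor.h"

Tensor *wrap_nchw(float *data) {
    vector<int> dims = {2, 16, 28, 28};            // N, C, H, W
    return new Tensor(4, dims, data, FLOAT32, 'd');
}
```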
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONCAT_H_
+#define _CONCAT_H_
+
+// NOTE: the original header names were lost in extraction; inferred from usage.
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "layer.h"
+#include "op_param.h"
+#include "tensor.h"
+
+// Template arguments below were stripped in extraction and are reconstructed
+// from usage.
+template <typename T>
+class Concat : public Layer<T>
+{
+public:
+    Concat();
+    ~Concat();
+
+    /*
+     * Concat Forward
+     * params:
+     * src: vector of input tensors
+     * axis: axis along which the inputs are concatenated
+     */
+    static Tensor *Forward(std::vector<Tensor *> src, int axis);
+
+    /*
+     * Concat backward
+     * params:
+     * diff_dst: gradient of the concatenated output
+     * offsets: split points along the axis, one per input
+     * axis: axis along which the gradient is split back
+     */
+    static std::vector<Tensor *> Backward(Tensor *diff_dst, std::vector<int> offsets, int axis);
+};
+
+#endif // _CONCAT_H_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/conv.h b/python/ideep4py/include/primitives/conv.h
new file mode 100644
index 00000000..7369eefd
--- /dev/null
+++ b/python/ideep4py/include/primitives/conv.h
@@ -0,0 +1,98 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
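A hedged usage sketch of the `Concat` interface just declared, instantiated for `float` (the header leaves the template argument to the caller):

```cpp
// Hedged sketch: join two 4-D tensors along the channel axis (axis 1).
#include "concat.h"

Tensor *join_channels(Tensor *a, Tensor *b) {
    std::vector<Tensor *> inputs = {a, b};
    return Concat<float>::Forward(inputs, /*axis=*/1);
}
```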
+ *
+ */
+
+
+#ifndef _CONV_H_
+#define _CONV_H_
+
+// NOTE: the original header names were lost in extraction; inferred from usage.
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "layer.h"
+#include "op_param.h"
+#include "tensor.h"
+
+// Template arguments below were stripped in extraction and are reconstructed
+// from usage.
+template <typename T>
+class Convolution2D : public Layer<T>
+{
+public:
+    Convolution2D();
+    ~Convolution2D();
+
+    /*
+     * Convolution Forward
+     * Y = W*X + b
+     * params:
+     * src: input, x
+     * weights: weights, w
+     * dst: output, y
+     * bias: bias, b
+     * cp: convolution parameters
+     */
+    static Tensor *Forward(Tensor *src,
+                           Tensor *weights,
+                           Tensor *bias,
+                           conv_param_t *cp);
+
+    /*
+     * Convolution backward weights
+     * gW = gy*x
+     * params:
+     * src: input, x
+     * diff_dst: diff dst, gy
+     * cp: convolution parameters
+     */
+    static Tensor *BackwardWeights(Tensor *src,
+                                   Tensor *diff_dst,
+                                   conv_param_t *cp);
+
+    /*
+     * Convolution backward weights & bias
+     * gW = gy*x, gb = sum(gy)
+     * params:
+     * src: input, x
+     * diff_dst: diff dst, gy
+     * cp: convolution parameters
+     */
+    static std::vector<Tensor *> BackwardWeightsBias(Tensor *src,
+                                                     Tensor *diff_dst,
+                                                     conv_param_t *cp);
+
+    /*
+     * Convolution backward data
+     * gx = gy*w
+     * params:
+     * weights: weights, w
+     * diff_dst: diff dst, gy
+     * cp: convolution parameters
+     */
+    static Tensor *BackwardData(Tensor *weights,
+                                Tensor *diff_dst,
+                                conv_param_t *cp);
+
+};
+
+#endif // _CONV_H_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/dropout.h b/python/ideep4py/include/primitives/dropout.h
new file mode 100644
index 00000000..24af68c6
--- /dev/null
+++ b/python/ideep4py/include/primitives/dropout.h
@@ -0,0 +1,58 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#pragma once
+
+// NOTE: the original header names were lost in extraction; inferred from usage.
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "layer.h"
+#include "tensor.h"
+
+// Template arguments reconstructed from usage.
+template <typename T>
+class Dropout : public Layer<T> {
+public:
+    /*
+     * Dropout Forward
+     * params:
+     * x: input
+     * ratio: input, dropout ratio
+     * y: output, vector. [0]: mask, [1]: y
+     * y = mask*x
+     */
+    static std::vector<Tensor *> Forward(Tensor* x, float ratio);
+
+    /*
+     * Dropout backward
+     * params:
+     * mask: input, dropout mask generated in the forward
+     * gy: input
+     * gx: output
+     * gx = mask*gy
+     */
+    static Tensor* Backward(Tensor* mask, Tensor* gy);
+};
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/eltwise.h b/python/ideep4py/include/primitives/eltwise.h
new file mode 100644
index 00000000..71d5fdd6
--- /dev/null
+++ b/python/ideep4py/include/primitives/eltwise.h
@@ -0,0 +1,80 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
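A hedged sketch of one `Convolution2D` forward call. The `conv_param_t` fields follow `op_param.h` further below; the shapes are illustrative, with `out_dims` chosen for a 3x3, stride-1, pad-1 convolution that preserves spatial size:

```cpp
// Hedged sketch: fill conv_param_t by hand and run the forward pass.
#include "conv.h"

Tensor *conv3x3(Tensor *src, Tensor *weights, Tensor *bias) {
    conv_param_t cp;
    cp.out_dims = {2, 32, 28, 28};   // N, O, H, W for a 2x16x28x28 input
    cp.kh = cp.kw = 3;               // kernel
    cp.sy = cp.sx = 1;               // stride
    cp.pad_lh = cp.pad_lw = cp.pad_rh = cp.pad_rw = 1;
    return Convolution2D<float>::Forward(src, weights, bias, &cp);
}
```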
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once + +#include +#include +#include +#include "layer.h" +#include "tensor.h" + +typedef enum _eltwise_algorithm { + ELTWISE_RELU = mkldnn::eltwise_relu, + ELTWISE_TANH = mkldnn::eltwise_tanh, + ELTWISE_ELU = mkldnn::eltwise_elu, + ELTWISE_SQUARE = mkldnn::eltwise_square, + ELTWISE_ABS = mkldnn::eltwise_abs, + ELTWISE_SQRT = mkldnn::eltwise_sqrt, + ELTWISE_LINEAR = mkldnn::eltwise_linear, + ELTWISE_BOUNDED_RELU = mkldnn::eltwise_bounded_relu, + ELTWISE_SOFT_RELU = mkldnn::eltwise_soft_relu, + ELTWISE_LOGISTIC = mkldnn::eltwise_logistic, +} eltwise_algorithm_t; + + +static inline mkldnn::algorithm ideepy2mkldnn_eltwise_algorithm(eltwise_algorithm_t alg_kind) { + return (mkldnn::algorithm)alg_kind; +} + +template class Eltwise; +template +class Eltwise : public Layer +{ +public: + Eltwise(); + ~Eltwise(); + + /* + * Eltwise Forward + * params: + * src: input, x + * dst: output, y + * y = max(x, 0) + */ + static Tensor *Forward(Tensor *src, eltwise_algorithm_t alg_kind, T2 alpha, T2 beta); + + /* + * Eltwise backward data + * params: + * src: input, x + * diff_dst: input, gy + * dst: output, gx + * gx = gy*y + */ + static Tensor *Backward(Tensor *src, Tensor *diff_dst, eltwise_algorithm_t alg_kind, T2 alpha, T2 beta); +}; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/layer.h b/python/ideep4py/include/primitives/layer.h new file mode 100644 index 00000000..6ab412dd --- /dev/null +++ b/python/ideep4py/include/primitives/layer.h @@ -0,0 +1,40 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _LAYER_H_ +#define _LAYER_H_ + +#include +#include + +template +class Layer { +public: + virtual ~Layer() {} +}; + +#endif // _LAYER_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/linear.h b/python/ideep4py/include/primitives/linear.h new file mode 100644 index 00000000..f329118f --- /dev/null +++ b/python/ideep4py/include/primitives/linear.h @@ -0,0 +1,74 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _LINEAR_H_ +#define _LINEAR_H_ + +#include +#include +#include +#include "layer.h" +#include "op_param.h" +#include "tensor.h" + +template +class Linear : public Layer +{ +public: + Linear(); + ~Linear(); + /* + *Linear forward + * Y = W*X + b + * params: + * src: input, x + * weights: weights, w + * dst: output, y + * bias: bias, b + */ + static Tensor *Forward( Tensor* src, + Tensor* weights, + Tensor* bias); + /* + * Linear backward weights + * gW = gy*x + * params: + * src: input, x + * diff_dst: diff dst, gy + */ + static std::vector BackwardWeights(Tensor* src, + Tensor* diff_dst, + bool need_bias); + /* + * Linear backward data + * gx = gy*w + * param: + * weights: weights, w + * diff_dst: diff dst, gy + */ + static Tensor *BackwardData(Tensor* weights, + Tensor* diff_dst); +}; +#endif //_LINEAR_H_ + diff --git a/python/ideep4py/include/primitives/lrn.h b/python/ideep4py/include/primitives/lrn.h new file mode 100755 index 00000000..293193b0 --- /dev/null +++ b/python/ideep4py/include/primitives/lrn.h @@ -0,0 +1,69 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
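A hedged sketch of the `Linear` interface above: the forward product `y = W*x + b`, then both backward passes:

```cpp
// Hedged sketch: one forward/backward round trip through Linear.
#include "linear.h"

void linear_round_trip(Tensor *x, Tensor *w, Tensor *b, Tensor *gy) {
    Tensor *y  = Linear<float>::Forward(x, w, b);
    Tensor *gx = Linear<float>::BackwardData(w, gy);
    std::vector<Tensor *> gw_gb =
        Linear<float>::BackwardWeights(x, gy, /*need_bias=*/true);
    (void)y; (void)gx; (void)gw_gb;
}
```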
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _LRN_H_ +#define _LRN_H_ + +#include +#include +#include +#include "layer.h" +#include "op_param.h" +#include "tensor.h" + +template +class LocalResponseNormalization : public Layer +{ +public: + LocalResponseNormalization(); + ~LocalResponseNormalization(); + + /* + * Lrn Forward + * params: + * src: input, x + * pp: lrn parameters + * + * ret + * vector: + * return dst and workspace + */ + static std::vector Forward(Tensor *src, lrn_param_t *pp); + + /* + * Lrn backward + * param: + * src: x + * diff_dst: diff dst, gy + * pp: lrn parameters + * return diff_src gx + */ + static Tensor *Backward(Tensor* src, Tensor *diff_dst, Tensor *ws, lrn_param_t* pp); + +}; + +#endif // _LRN_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/op_param.h b/python/ideep4py/include/primitives/op_param.h new file mode 100644 index 00000000..dff288f4 --- /dev/null +++ b/python/ideep4py/include/primitives/op_param.h @@ -0,0 +1,66 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
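A hedged sketch of the LRN interface above, using AlexNet-style parameters; `Forward` returns the output together with the workspace that `Backward` later consumes:

```cpp
// Hedged sketch: fill lrn_param_t (defined in op_param.h below) and run
// the forward pass; the returned vector holds dst and the workspace.
#include "lrn.h"

std::vector<Tensor *> lrn_fwd(Tensor *x) {
    lrn_param_t pp;
    pp.n = 5;                        // local size
    pp.k = 2.0;
    pp.alpha = 1e-4;
    pp.beta = 0.75;
    pp.algo_kind = lrn_param_t::lrn_across_channels;
    return LocalResponseNormalization<float>::Forward(x, &pp);
}
```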
+ * + */ + + +#ifndef _OP_PARAM_H_ +#define _OP_PARAM_H_ + +#include + +struct conv_param_t { + std::vector out_dims; + int kh, kw; // kernel size + int dilate_y = 0, dilate_x = 0; // in MKL-DNN, common conv is treated as 0 dilate + int sy, sx; // stride + int pad_lh, pad_lw, pad_rh, pad_rw; //padding +}; + +struct pooling_param_t { + std::vector out_dims; + int kh, kw; // kernel size + int sy, sx; // stride + int pad_lh, pad_lw, pad_rh, pad_rw; //padding + + enum algorithm { + pooling_max, + pooling_avg, + pooling_avg_include_padding, + pooling_avg_exclude_padding, + } algo_kind; +}; + +struct lrn_param_t { + int n; // local size + double k; + double alpha; + double beta; + + enum algorithm { + lrn_across_channels, + lrn_within_channel, + } algo_kind; +}; +#endif // _OP_PARAM_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/bn_bwd.h b/python/ideep4py/include/primitives/ops/bn_bwd.h new file mode 100644 index 00000000..bd355129 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/bn_bwd.h @@ -0,0 +1,95 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
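The parameter structs above carry `out_dims` explicitly rather than deriving them, so the caller does the size arithmetic. A hedged helper (not part of the header) with the conventional formula, matching the `dilate = 0` convention noted in `conv_param_t`:

```cpp
// Hedged helper: conventional output-size arithmetic for filling out_dims.
inline int conv_out_size(int in, int k, int stride,
                         int pad_l, int pad_r, int dilate) {
    int kernel_extent = (k - 1) * (dilate + 1) + 1;  // dilate 0 => extent k
    return (in + pad_l + pad_r - kernel_extent) / stride + 1;
}
// e.g. conv_out_size(28, 3, 1, 1, 1, 0) == 28: a 3x3 stride-1 pad-1
// convolution preserves a 28-pixel edge.
```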
+ * + */ + + +#ifndef _BN_BWD_H_ +#define _BN_BWD_H_ + +#include +#include +#include "op.h" + +template +class batch_normalization_bwd : public Op { +public: + batch_normalization_bwd(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + float eps, bool scale_shift) : + flags_(0), bn_size_(src_d[1]), + bn_bwd_(nullptr), src_mem_(nullptr), + diff_dst_mem_(nullptr), mean_mem_(nullptr), + var_mem_(nullptr), w_mem_(nullptr), + diff_src_mem_(nullptr), diff_w_mem_(nullptr), + bwd_stream_(new mkldnn::stream(mkldnn::stream::kind::eager)) { + setup(src_d, diff_dst_d, eps, scale_shift); + } + + ~batch_normalization_bwd() {} + + void setup(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + float eps, bool scale_shift); + + void execute(void *src, void *diff_dst, void *mean, + void *var, void *w, void *diff_src, void *diff_w); + +public: + mkldnn_memory_format_t get_src_fmt() { + return (*src_mem_).get_primitive_desc().desc().data.format; + } + + mkldnn_memory_format_t get_diff_dst_fmt() { + return (*diff_dst_mem_).get_primitive_desc().desc().data.format; + } + + mkldnn_memory_format_t get_diff_src_fmt() { + return (*diff_src_mem_).get_primitive_desc().desc().data.format; + } + + mkldnn_memory_format_t get_diff_w_fmt() { + return (*diff_w_mem_).get_primitive_desc().desc().data.format; + } + +private: + unsigned long flags_; + int bn_size_; + + std::shared_ptr bn_bwd_; + + std::shared_ptr src_mem_; + std::shared_ptr diff_dst_mem_; + std::shared_ptr mean_mem_; + std::shared_ptr var_mem_; + std::shared_ptr w_mem_; + std::shared_ptr diff_src_mem_; + std::shared_ptr diff_w_mem_; + + std::vector bwd_primitives_; + std::shared_ptr bwd_stream_; + + mkldnn::memory::desc get_desc_data(mkldnn::memory m) { + return m.get_primitive_desc().desc().data; + } +}; + +#endif // _BN_BWD_H_ diff --git a/python/ideep4py/include/primitives/ops/bn_fwd.h b/python/ideep4py/include/primitives/ops/bn_fwd.h new file mode 100644 index 00000000..4fcb0ee5 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/bn_fwd.h @@ -0,0 +1,113 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
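Reviewer note: a usage sketch for `batch_normalization_bwd` as declared above. The shapes and buffer names (`x`, `gy`, `mean`, `var`, `w`, `gx`, `gw`) are hypothetical; each buffer is caller-owned and expected in the layout reported by the matching `get_*_fmt()` accessor:

```cpp
mkldnn::memory::dims src_d      = {32, 64, 28, 28}; // (n, c, h, w)
mkldnn::memory::dims diff_dst_d = {32, 64, 28, 28};

batch_normalization_bwd<float> bn_bwd(src_d, diff_dst_d,
                                      /*eps=*/1e-5f, /*scale_shift=*/true);

// x, gy, mean, var, w (scale/shift), gx, gw: float buffers owned by the caller
bn_bwd.execute(x, gy, mean, var, w, gx, gw);
```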
+ * + */ + + +#ifndef _BN_FWD_H_ +#define _BN_FWD_H_ + +#include +#include +#include "op.h" + +template +class batch_normalization_fwd : public Op { +public: + batch_normalization_fwd(mkldnn::memory::dims src_d, + float eps, + bool scale_shift, + bool global_stats, + bool training) : + flags_(0), pkind_(mkldnn::forward_training), + bn_size_(src_d[1]), bn_fwd_(nullptr), src_mem_(nullptr), + w_mem_(nullptr), dst_mem_(nullptr), + mean_mem_(nullptr), var_mem_(nullptr), + fwd_stream_(new mkldnn::stream(mkldnn::stream::kind::eager)) { + setup(src_d, eps, scale_shift, global_stats, training); + } + + ~batch_normalization_fwd() {} + + void setup(mkldnn::memory::dims src_d, float eps, + bool scale_shift, bool global_stats, bool training); + + void execute(void *src, void *w, void *dst, void *mean, void *var); + +public: + mkldnn_memory_format_t get_src_fmt() { + return (*src_mem_).get_primitive_desc().desc().data.format; + } + + mkldnn_memory_format_t get_dst_fmt() { + return (*dst_mem_).get_primitive_desc().desc().data.format; + } + + mkldnn_memory_format_t get_mean_fmt() { + return (*mean_mem_).get_primitive_desc().desc().data.format; + } + + int get_mean_ndims() { + return static_cast((*mean_mem_).get_primitive_desc().desc().data.ndims); + } + + mkldnn::memory::dims get_mean_dims() { + std::vector dims; + dims.push_back(bn_size_); + return dims; + } + + mkldnn_memory_format_t get_var_fmt() { + return (*var_mem_).get_primitive_desc().desc().data.format; + } + + int get_var_ndims() { + return static_cast((*var_mem_).get_primitive_desc().desc().data.ndims); + } + + mkldnn::memory::dims get_var_dims() { + std::vector dims; + dims.push_back(bn_size_); + return dims; + } + +private: + unsigned long flags_; + mkldnn::prop_kind pkind_; + int bn_size_; + + std::shared_ptr bn_fwd_; + + std::shared_ptr src_mem_; + std::shared_ptr w_mem_; + std::shared_ptr dst_mem_; + std::shared_ptr mean_mem_; + std::shared_ptr var_mem_; + + std::vector fwd_primitives_; + std::shared_ptr fwd_stream_; + + mkldnn::memory::desc get_desc_data(mkldnn::memory m) { + return m.get_primitive_desc().desc().data; + } +}; + +#endif // _BN_FWD_H_ diff --git a/python/ideep4py/include/primitives/ops/concat_bwd.h b/python/ideep4py/include/primitives/ops/concat_bwd.h new file mode 100644 index 00000000..e7ae06b8 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/concat_bwd.h @@ -0,0 +1,86 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ *
+ */
+
+
+#ifndef _CONCAT_BWD_H_
+#define _CONCAT_BWD_H_
+
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "op.h"
+
+template <typename T>
+class ConcatBwd : public Op<T>
+{
+public:
+    ConcatBwd(std::vector<mkldnn::memory::dims> diff_src_ds,
+              mkldnn::memory::dims diff_dst_d,
+              int axis);
+    ~ConcatBwd();
+
+    /*
+     * Concat backward primitive setup
+     * Params:
+     *     diff_src_ds: gradients w.r.t. each concatenated input
+     *     diff_dst_d: output, (n, out_c, out_h, out_w)
+     *     axis: axis to concat
+     */
+    void setup(std::vector<mkldnn::memory::dims> diff_src_ds,
+               mkldnn::memory::dims diff_dst_d,
+               int axis);
+
+    /*
+     * Concat backward execute
+     */
+    void execute(std::vector<void *> diff_srcs, void *diff_dst);
+
+public:
+    // expected memory format for this primitive instance
+    // backward
+    std::vector<mkldnn::memory::format> diff_src_fmts_;
+    mkldnn::memory::format diff_dst_fmt_;
+
+private:
+    int axis_;
+
+    // MKLDNN memory
+    std::vector<std::shared_ptr<mkldnn::memory>> diff_src_mems_; // gxs
+
+    std::shared_ptr<mkldnn::memory::desc> diff_dst_md_;            // gy
+    std::shared_ptr<mkldnn::memory::primitive_desc> diff_dst_mpd_; // gy
+    std::shared_ptr<mkldnn::memory> diff_dst_mem_;                 // gy
+
+    // desc & primitive desc
+    std::shared_ptr<mkldnn::reorder::primitive_desc> reorder_pd_;
+    std::shared_ptr<mkldnn::primitive> reorder_prim_;
+
+    std::shared_ptr<mkldnn::stream> bwd_stream_;
+    std::vector<mkldnn::primitive> bwd_primitives_; // bwd primitive vector
+};
+
+#endif // _CONCAT_BWD_H_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/ops/concat_fwd.h b/python/ideep4py/include/primitives/ops/concat_fwd.h
new file mode 100644
index 00000000..e8484337
--- /dev/null
+++ b/python/ideep4py/include/primitives/ops/concat_fwd.h
@@ -0,0 +1,86 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONCAT_FWD_H_
+#define _CONCAT_FWD_H_
+
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "op.h"
+
+template <typename T>
+class ConcatFwd : public Op<T>
+{
+public:
+    ConcatFwd(std::vector<mkldnn::memory::dims> src_ds,
+              mkldnn::memory::dims dst_d, int axis);
+    ~ConcatFwd();
+
+    /*
+     * Concat forward primitive setup
+     * Params:
+     *     src_ds: inputs
+     *     dst_d: output, (n, out_c, out_h, out_w)
+     *     axis: axis to concat
+     */
+    void setup(std::vector<mkldnn::memory::dims> src_ds,
+               mkldnn::memory::dims dst_d,
+               int axis);
+
+    /*
+     * Concat forward execute
+     */
+    void execute(std::vector<void *> src, void *dst);
+
+public:
+    // expected memory format for this primitive instance
+    // forward
+    std::vector<mkldnn::memory::format> src_fmts_;
+    mkldnn::memory::format dst_fmt_;
+
+private:
+    int axis_;
+
+    // MKLDNN memory
+    // forward
+    std::vector<mkldnn::memory::primitive_desc> src_mpds_;  // xs
+    std::vector<std::shared_ptr<mkldnn::memory>> src_mems_; // xs
+    std::vector<mkldnn::primitive::at> src_prim_at_;        // xs
+
+    std::shared_ptr<mkldnn::memory::desc> dst_md_; // y
+    std::shared_ptr<mkldnn::memory> dst_mem_;      // y
+
+    std::shared_ptr<mkldnn::stream> fwd_stream_;
+
+    // desc & primitive desc
+    std::shared_ptr<mkldnn::concat::primitive_desc> concat_pd_;
+    std::shared_ptr<mkldnn::concat> concat_fwd_;
+};
+
+#endif // _CONCAT_FWD_H_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/ops/conv_bwd_data.h b/python/ideep4py/include/primitives/ops/conv_bwd_data.h
new file mode 100644
index 00000000..765d99f6
--- /dev/null
+++ b/python/ideep4py/include/primitives/ops/conv_bwd_data.h
@@ -0,0 +1,112 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
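Reviewer note: a usage sketch for `ConcatFwd` as declared above, joining two tensors along the channel axis. The shapes and the buffer names `x0`, `x1`, `y` are hypothetical:

```cpp
// Two inputs concatenated along the channel axis (axis 1)
std::vector<mkldnn::memory::dims> src_ds = {
    {32, 16, 28, 28},
    {32, 48, 28, 28},
};
mkldnn::memory::dims dst_d = {32, 64, 28, 28}; // 16 + 48 channels

ConcatFwd<float> concat(src_ds, dst_d, /*axis=*/1);

std::vector<void *> srcs = {x0, x1}; // caller-owned input buffers
concat.execute(srcs, y);
```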
+ *
+ */
+
+
+#ifndef _CONV_BWD_DATA_H_
+#define _CONV_BWD_DATA_H_
+
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "op.h"
+
+template <typename T>
+class Convolution2DBwdData : public Op<T>
+{
+public:
+    Convolution2DBwdData(mkldnn::memory::dims diff_src_d,
+                         mkldnn::memory::dims w_d,
+                         mkldnn::memory::dims diff_dst_d,
+                         int dilate_y, int dilate_x,
+                         int sy, int sx,
+                         int pad_lh, int pad_lw, int pad_rh, int pad_rw);
+    ~Convolution2DBwdData();
+
+    /*
+     * Convolution backward data primitive setup
+     * Params:
+     *     diff_src_d: input, (n, c, h, w)
+     *     w_d: weight, (out_c, in_c, h, w)
+     *     diff_dst_d: output, (n, out_c, out_h, out_w)
+     */
+    void setup(mkldnn::memory::dims diff_src_d,
+               mkldnn::memory::dims w_d,
+               mkldnn::memory::dims diff_dst_d,
+               int dilate_y, int dilate_x,
+               int sy, int sx,
+               int pad_lh, int pad_lw,
+               int pad_rh, int pad_rw);
+
+    /*
+     * Convolution backward data execute
+     */
+    void execute(void* diff_src, void* w, void* diff_dst);
+
+public:
+    // expected memory format for this primitive instance
+    // backward data
+    mkldnn::memory::format diff_src_fmt_;
+    mkldnn::memory::format weights_fmt_;
+    mkldnn::memory::format diff_dst_fmt_;
+
+    // convolution primitive
+    std::shared_ptr<mkldnn::primitive> conv_bwd_data_;
+
+private:
+    // MKLDNN memory
+    // backward data
+    std::shared_ptr<mkldnn::memory> diff_src_mem_; // gx
+    std::shared_ptr<mkldnn::memory> weights_mem_;  // W
+    std::shared_ptr<mkldnn::memory> diff_dst_mem_; // gy
+
+    std::shared_ptr<mkldnn::stream> bwd_data_stream_;
+    std::vector<mkldnn::primitive> bwd_data_primitives_;
+
+    // desc & primitive desc
+    // backward data
+    std::shared_ptr<mkldnn::convolution_backward_data::desc> bwd_data_desc_;
+    std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> bwd_data_pd_;
+
+    // FIXME
+    // forward hint, will be removed in the future
+    std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
+
+    // memory dims
+    mkldnn::memory::dims dilates_;
+    mkldnn::memory::dims strides_;
+    mkldnn::memory::dims padding_l_;
+    mkldnn::memory::dims padding_r_;
+
+    // memory desc
+    // forward & backward can share the same mem desc
+    std::shared_ptr<mkldnn::memory::desc> diff_src_md_; // gx
+    std::shared_ptr<mkldnn::memory::desc> weights_md_;  // W
+    std::shared_ptr<mkldnn::memory::desc> diff_dst_md_; // gy
+};
+
+#endif // _CONV_BWD_DATA_H_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/include/primitives/ops/conv_bwd_weights.h b/python/ideep4py/include/primitives/ops/conv_bwd_weights.h
new file mode 100644
index 00000000..2f16f276
--- /dev/null
+++ b/python/ideep4py/include/primitives/ops/conv_bwd_weights.h
@@ -0,0 +1,118 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ * + */ + + +#ifndef _CONV_BWD_WEIGHTS_H_ +#define _CONV_BWD_WEIGHTS_H_ + +#include +#include +#include +#include "op.h" + +template +class Convolution2DBwdWeights : public Op +{ +public: + Convolution2DBwdWeights(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw); + ~Convolution2DBwdWeights(); + + /* + * Convolution backward weight primitive setup + * Params: + * src_d: input, (n,c,h,w) + * diff_w_d: diff weight, (out_c, in_c, h, w) + * diff_b_d: diff_bias + * diff_dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, + int pad_rh, int pad_rw); + + /* + * Convolution backward weights with bias + */ + void execute(void* src, void* diff_w, void* diff_b, void* diff_dst); + + /* + * Convolution backward weights without bias + */ + void execute(void* src, void* diff_w, void* diff_dst); + +public: + // expected memory format for this primitive instance + // forward + mkldnn::memory::format src_fmt_; + mkldnn::memory::format diff_weights_fmt_; + mkldnn::memory::format diff_dst_fmt_; + + // convolution primitive + std::shared_ptr conv_bwd_weights_; + +private: + //MKLDNN memory + //backward weights + std::shared_ptr src_mem_; // x + std::shared_ptr diff_weights_mem_;// gW + std::shared_ptr diff_bias_mem_;// gb + std::shared_ptr diff_dst_mem_; //gy + + // + std::shared_ptr bwd_weights_stream_; + std::vector bwd_weights_primitives_; + + //desc & prmitive desc + //backward weights + std::shared_ptr bwd_weights_desc_; + std::shared_ptr bwd_weights_pd_; + + // FIXME + // forward hint, will be remove in future + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + //memory dims + mkldnn::memory::dims dilates_; + mkldnn::memory::dims strides_; + mkldnn::memory::dims padding_l_; + mkldnn::memory::dims padding_r_; + + //memory desc + //forward & backward can share same mem desc + std::shared_ptr src_md_; //x + std::shared_ptr diff_weights_md_;// gW + std::shared_ptr diff_bias_md_; // gb + std::shared_ptr diff_dst_md_; // gy +}; + +#endif // _CONV_BWD_WEIGHTS_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/conv_fwd.h b/python/ideep4py/include/primitives/ops/conv_fwd.h new file mode 100644 index 00000000..2f2bf216 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/conv_fwd.h @@ -0,0 +1,111 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _CONV_FWD_H_ +#define _CONV_FWD_H_ + +#include +#include +#include +#include "op.h" + +template +class Convolution2DFwd : public Op +{ +public: + Convolution2DFwd(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw); + ~Convolution2DFwd(); + + /* + * Convolution forward primitive setup + * Params: + * src_d: input, (n,c,h,w) + * W_d: weight, (out_c, in_c, h, w) + * b_d: bias, if no bias, expected b_d as None dims ({}), not NULL + * dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d, + int dilate_y, int dilate_x, + int s1, int s2, + int pl1, int pl2, + int pr1, int pr2); + + /* + * Convolution forward execute with bias + */ + void execute(void* src, void* w, void* b, void* dst); + + /* + * Convolution forward execute without bias + */ + void execute(void* src, void* w, void* dst); + +public: + // expected memory format for this primitive instance + // forward + mkldnn::memory::format src_fmt_; + mkldnn::memory::format weights_fmt_; + mkldnn::memory::format dst_fmt_; + + // convolution primitive + std::shared_ptr conv_fwd_; + +private: + //MKLDNN memory + //forward + std::shared_ptr src_mem_; // x + std::shared_ptr weights_mem_;// W + std::shared_ptr bias_mem_;// b + std::shared_ptr dst_mem_; //y + + std::shared_ptr fwd_stream_; + std::vector fwd_primitives_; + + //desc & prmitive desc + //forward + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + //memory dims + mkldnn::memory::dims dilates_; + mkldnn::memory::dims strides_; + mkldnn::memory::dims padding_l_; + mkldnn::memory::dims padding_r_; + + //memory desc + std::shared_ptr src_md_; //x + std::shared_ptr weights_md_;// W + std::shared_ptr bias_md_; // b + std::shared_ptr dst_md_; // y +}; + +#endif // _CONV_FWD_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/eltwise_bwd.h b/python/ideep4py/include/primitives/ops/eltwise_bwd.h new file mode 100644 index 00000000..0da8989d --- /dev/null +++ b/python/ideep4py/include/primitives/ops/eltwise_bwd.h @@ -0,0 +1,89 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
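Reviewer note: a usage sketch for `Convolution2DFwd` as declared above; the 7x7/stride-2 shapes and the buffer names `x`, `w`, `b`, `y` are illustrative assumptions:

```cpp
mkldnn::memory::dims src_d = {32, 3, 224, 224};  // x: (n, c, h, w)
mkldnn::memory::dims w_d   = {64, 3, 7, 7};      // W: (out_c, in_c, kh, kw)
mkldnn::memory::dims b_d   = {64};               // pass {} to skip the bias
mkldnn::memory::dims dst_d = {32, 64, 112, 112}; // (224 + 3 + 3 - 7) / 2 + 1 = 112

Convolution2DFwd<float> conv(src_d, w_d, b_d, dst_d,
                             /*dilate_y=*/0, /*dilate_x=*/0,
                             /*sy=*/2, /*sx=*/2,
                             /*pad_lh=*/3, /*pad_lw=*/3,
                             /*pad_rh=*/3, /*pad_rw=*/3);

// Buffers must be laid out as reported by src_fmt_ / weights_fmt_ / dst_fmt_
conv.execute(x, w, b, y);
```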
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once + +#include +#include +#include "op.h" + +template class EltwiseBwd; +template +class EltwiseBwd : public Op +{ +public: + EltwiseBwd(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta); + ~EltwiseBwd(); + + /* + * Eltwise backward primitive setup + * Params: + * src_d: input, (n,c,h,w) + * dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta); + + /* + * Eltwise backward execute + */ + void execute(void* src, void* dst_diff, void *src_diff); + +public: + // expected memory format for this primitive instance + // backward + mkldnn::memory::format src_diff_fmt_; + + // Eltwise primitive + std::shared_ptr eltwise_bwd_; + +private: + //MKLDNN memory + //backward + std::shared_ptr src_mem_; // x + std::shared_ptr dst_diff_mem_; //gy + std::shared_ptr src_diff_mem_; //gx + + std::shared_ptr bwd_stream_; + std::vector bwd_primitives_; + + //desc & prmitive desc + //backward + std::shared_ptr bwd_desc_; + std::shared_ptr bwd_pd_; + + //memory desc + std::shared_ptr src_md_; //x + std::shared_ptr dst_diff_md_; // gy + + //memory primitive desc + std::shared_ptr src_mpd_; //x + std::shared_ptr dst_diff_mpd_; //gy + + // fwd primitive desc + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; +}; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/eltwise_fwd.h b/python/ideep4py/include/primitives/ops/eltwise_fwd.h new file mode 100644 index 00000000..12d75d6d --- /dev/null +++ b/python/ideep4py/include/primitives/ops/eltwise_fwd.h @@ -0,0 +1,84 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#pragma once + +#include +#include +#include "op.h" + +template class EltwiseFwd; +template +class EltwiseFwd : public Op +{ +public: + EltwiseFwd(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta); + ~EltwiseFwd(); + + /* + * Eltwise forward primitive setup + * Params: + * src_d: input, (n,c,h,w) + * dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta); + + /* + * Eltwise forward execute + */ + void execute(void* src, void* dst); + +public: + // expected memory format for this primitive instance + // forward + mkldnn::memory::format src_fmt_; + mkldnn::memory::format dst_fmt_; + + // Eltwise primitive + std::shared_ptr eltwise_fwd_; + +private: + //MKLDNN memory + //forward + std::shared_ptr src_mem_; // x + std::shared_ptr dst_mem_; //y + + std::shared_ptr fwd_stream_; + std::vector fwd_primitives_; + + //desc & prmitive desc + //forward + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + //memory desc + std::shared_ptr src_md_; //x + std::shared_ptr dst_md_; // y + + //memory primitive desc + std::shared_ptr src_mpd_; //x +}; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/linear_bwd_data.h b/python/ideep4py/include/primitives/ops/linear_bwd_data.h new file mode 100644 index 00000000..f4cf7d82 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/linear_bwd_data.h @@ -0,0 +1,125 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
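Reviewer note: a usage sketch for `EltwiseFwd`, assuming its two template parameters are the element type and the alpha/beta type (both `float` here); the shapes and buffers `x`, `y` are hypothetical. For `eltwise_relu`, `alpha` is the negative slope and `beta` is unused:

```cpp
mkldnn::memory::dims src_d = {32, 64, 28, 28};

EltwiseFwd<float, float> relu(src_d, mkldnn::algorithm::eltwise_relu,
                              mkldnn::memory::format::nchw,
                              /*alpha=*/0.0f, /*beta=*/0.0f);
relu.execute(x, y); // x, y: caller-owned float buffers
```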
+ * + */ + + +#ifndef _LINEAR_BWD_DATA_H_ +#define _LINEAR_BWD_DATA_H_ + +#include +#include +#include +#include "op.h" + +template +class LinearBwdData : public Op +{ +public: + LinearBwdData(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d); + ~LinearBwdData(); + /* + * Linear backward data primitive setup + * Params: + * diff_src_d: input, (n,c,h,w) + * w_d: diff weight, (out_c, in_c, h, w) + * diff_dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d); + /* + * Linear backward weights without bias + */ + void execute(void* diff_src, void* w, void* diff_dst); +public: + // expected memory format for this primitive instance + // forward + mkldnn::memory::format diff_src_fmt_; + mkldnn::memory::format weights_fmt_; + mkldnn::memory::format diff_dst_fmt_; + + //linear primitive + std::shared_ptr linear_bwd_data_; +private: + //MKLDNN memory + //backward weights + std::shared_ptr diff_src_mem_; // gx + std::shared_ptr weights_mem_;//w + std::shared_ptr diff_dst_mem_; //gy + + // + std::shared_ptr bwd_data_stream_; + std::vector bwd_data_primitives_; + + //desc & primitive desc + //backward weights + std::shared_ptr bwd_data_desc_; + std::shared_ptr bwd_data_pd_; + + //FIXME + //forward hint, will be removed in the future; + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + //memory desc + //forward & backward can share the same mem desc + std::shared_ptr diff_src_md_; //gx + std::shared_ptr weights_md_; // W + std::shared_ptr diff_dst_md_; //gy +}; +#endif //_LINEAR_BWD_DATA_H + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/python/ideep4py/include/primitives/ops/linear_bwd_weights.h b/python/ideep4py/include/primitives/ops/linear_bwd_weights.h new file mode 100644 index 00000000..62600bd4 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/linear_bwd_weights.h @@ -0,0 +1,116 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ *
+ */
+
+
+#ifndef _LINEAR_BWD_WEIGHTS_H_
+#define _LINEAR_BWD_WEIGHTS_H_
+
+#include <mkldnn.hpp>
+#include <vector>
+#include <memory>
+#include "op.h"
+
+template <typename T>
+class LinearBwdWeights : public Op<T>
+{
+public:
+    LinearBwdWeights(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d,
+                     mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d);
+    ~LinearBwdWeights();
+    /*
+     * Linear backward weights primitive setup
+     * Params:
+     *     src_d: input, (n, c, h, w)
+     *     diff_w_d: diff weight, (out_c, in_c, h, w)
+     *     diff_b_d: diff bias
+     *     diff_dst_d: output, (n, out_c, out_h, out_w)
+     */
+    void setup(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d,
+               mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d);
+    /*
+     * Linear backward weights with bias
+     */
+    void execute(void* src, void* diff_w, void* diff_b, void* diff_dst);
+    /*
+     * Linear backward weights without bias
+     */
+    void execute(void* src, void* diff_w, void* diff_dst);
+public:
+    // expected memory format for this primitive instance
+    // backward weights
+    mkldnn::memory::format src_fmt_;
+    mkldnn::memory::format diff_weights_fmt_;
+    mkldnn::memory::format diff_dst_fmt_;
+    // linear primitive
+    std::shared_ptr<mkldnn::primitive> linear_bwd_weights_;
+private:
+    // MKLDNN memory
+    // backward weights
+    std::shared_ptr<mkldnn::memory> src_mem_;          // x
+    std::shared_ptr<mkldnn::memory> diff_weights_mem_; // gW
+    std::shared_ptr<mkldnn::memory> diff_bias_mem_;    // gb
+    std::shared_ptr<mkldnn::memory> diff_dst_mem_;     // gy
+
+    std::shared_ptr<mkldnn::stream> bwd_weights_stream_;
+    std::vector<mkldnn::primitive> bwd_weights_primitives_;
+    // desc & primitive desc
+    // backward weights
+    std::shared_ptr<mkldnn::inner_product_backward_weights::desc> bwd_weights_desc_;
+    std::shared_ptr<mkldnn::inner_product_backward_weights::primitive_desc> bwd_weights_pd_;
+    // FIXME
+    // forward hint, will be removed in the future
+    std::shared_ptr<mkldnn::inner_product_forward::desc> fwd_desc_;
+    std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> fwd_pd_;
+
+    // memory desc
+    // forward & backward can share the same mem desc
+    std::shared_ptr<mkldnn::memory::desc> src_md_;          // x
+    std::shared_ptr<mkldnn::memory::desc> diff_weights_md_; // gW
+    std::shared_ptr<mkldnn::memory::desc> diff_bias_md_;    // gb
+    std::shared_ptr<mkldnn::memory::desc> diff_dst_md_;     // gy
+};
+
+#endif // _LINEAR_BWD_WEIGHTS_H_
+
diff --git a/python/ideep4py/include/primitives/ops/linear_fwd.h b/python/ideep4py/include/primitives/ops/linear_fwd.h
new file mode 100644
index 00000000..ae22eef7
--- /dev/null
+++ b/python/ideep4py/include/primitives/ops/linear_fwd.h
@@ -0,0 +1,116 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ * + */ + + +#ifndef _LINEAR_FWD_H_ +#define _LINEAR_FWD_H_ + +#include +#include +#include +#include "op.h" + +template +class LinearFwd : public Op +{ +public: + LinearFwd(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d); + ~LinearFwd(); + /*Linear forward primitive setup + * Params: + * src_d: input, (n, c, h, w) + * W_d: weight, (out_c, in_c, h, w) + * b_d: bias, if no bias, expected b_d as None dims({}), not NULL + * dst_d: output, (n, out_c, out_h, out_w) + */ + void setup(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d); + /* + * Linear forward execute with bias + */ + void execute(void *src, void* w, void* b, void* dst); + /* + * Linear forward execute without bias + */ + void execute(void *src, void* w, void* dst); +public: + //expected memory format for this primitive instance + //forward + mkldnn::memory::format src_fmt_; + mkldnn::memory::format weights_fmt_; + mkldnn::memory::format dst_fmt_; + //linear primitive + std::shared_ptr linear_fwd_; +private: + //MKLDNN memory + //forward + std::shared_ptr src_mem_;// x + std::shared_ptr weights_mem_;// W + std::shared_ptr bias_mem_;// b + std::shared_ptr dst_mem_; // y + + std::shared_ptr fwd_stream_; + std::vector fwd_primitives_; + + //desc & primitive desc + //forward + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + //memory desc + std::shared_ptr src_md_;//x + std::shared_ptr weights_md_;//W + std::shared_ptr bias_md_;//b + std::shared_ptr dst_md_;// y +}; +#endif //__LINEAR_FWD_H_ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/python/ideep4py/include/primitives/ops/lrn_bwd.h b/python/ideep4py/include/primitives/ops/lrn_bwd.h new file mode 100755 index 00000000..59e024af --- /dev/null +++ b/python/ideep4py/include/primitives/ops/lrn_bwd.h @@ -0,0 +1,106 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
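Reviewer note: a usage sketch for `LinearFwd` as declared above, for a fully connected layer mapping 512 features to 10; shapes and the buffers `x`, `w`, `b`, `y` are illustrative:

```cpp
mkldnn::memory::dims src_d = {32, 512}; // x: (n, in_c)
mkldnn::memory::dims w_d   = {10, 512}; // W: (out_c, in_c)
mkldnn::memory::dims b_d   = {10};      // pass {} to skip the bias
mkldnn::memory::dims dst_d = {32, 10};  // y: (n, out_c)

LinearFwd<float> fc(src_d, w_d, b_d, dst_d);
fc.execute(x, w, b, y);
```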
+ * + */ + + +#pragma once +#ifndef _LRN_BWD_H_ +#define _LRN_BWD_H_ + +#include +#include +#include +#include +#include "op.h" + +template +class LocalResponseNormalizationBwd: public Op{ +public: + LocalResponseNormalizationBwd(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind); // alg_kind = mkldnn::algorithm::lrn_across_channels + + ~LocalResponseNormalizationBwd(); + + /* + * lrn backward primitive setup + * Params: + * src_d: src + * diff_dst_d: diff dst + */ + void setup(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind); // alg_kind = mkldnn::algorithm::lrn_across_channels + + /* + * lrn backward execute + * params: + * src: + * diff_src: diff_src + * diff_dst: diff_dst + * ws: workspace + */ + void execute(void *src, void *diff_src, void *diff_dst, void *ws=NULL); + +public: + // expected memory format + mkldnn::memory::format src_fmt_; + mkldnn::memory::format diff_src_fmt_; + mkldnn::memory::format diff_dst_fmt_; + mkldnn::memory::format ws_fmt_; + + // algo + mkldnn::algorithm alg_kind_; +private: + // lrn primitive + std::shared_ptr bwd_; + std::shared_ptr bwd_stream_; + + // MKL-DNN memory, just dummy data + std::shared_ptr src_mem_; + std::shared_ptr ws_mem_; + std::shared_ptr diff_src_mem_; + std::shared_ptr diff_dst_mem_; + std::shared_ptr src_md_; + std::shared_ptr diff_src_md_; + std::shared_ptr diff_dst_md_; + + // fwd hint + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + std::shared_ptr bwd_desc_; + std::shared_ptr bwd_pd_; + + std::vector bwd_primitives_; +}; + +#endif // _LRN_BWD_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/lrn_fwd.h b/python/ideep4py/include/primitives/ops/lrn_fwd.h new file mode 100755 index 00000000..ec45443f --- /dev/null +++ b/python/ideep4py/include/primitives/ops/lrn_fwd.h @@ -0,0 +1,97 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#pragma once +#ifndef _LRN_FWD_H_ +#define _LRN_FWD_H_ + +#include +#include +#include +#include +#include "op.h" + +template +class LocalResponseNormalizationFwd: public Op{ +public: + LocalResponseNormalizationFwd(mkldnn::memory::dims src_d, mkldnn::memory::format src_fmt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind); // alg_kind = mkldnn::algorithm::lrn_across_channels + + ~LocalResponseNormalizationFwd(); + + /* + * lrn forward primitive setup + * Params: + * src_d: input + * dst_d: out_put + */ + void setup(mkldnn::memory::dims src_d, mkldnn::memory::format src_fmt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind); // alg_kind = mkldnn::algorithm::lrn_across_channels + + /* + * lrn forward execute + * params: + * src: input + * dst: output + * ws: workspace + */ + void execute(void *src, void *dst, void *ws=NULL); + +public: + // expected memory format + mkldnn::memory::format src_fmt_; + mkldnn::memory::format dst_fmt_; + mkldnn::memory::format ws_fmt_; + //workspace size + mkldnn::memory::dims ws_dims_; + mkldnn::memory::data_type ws_dt_; + size_t ws_size_; + + // algo + mkldnn::algorithm alg_kind_; + // int local_size_; +private: + // lrn primitive + std::shared_ptr fwd_; + std::shared_ptr fwd_stream_; + + // MKL-DNN memory, just dummy data + std::shared_ptr ws_mem_; + std::shared_ptr src_mem_; + std::shared_ptr dst_mem_; + std::shared_ptr src_md_; + std::shared_ptr dst_md_; + + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + std::vector fwd_primitives_; +}; + +#endif // _LRN_FWD_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/op.h b/python/ideep4py/include/primitives/ops/op.h new file mode 100644 index 00000000..73020301 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/op.h @@ -0,0 +1,42 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
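Reviewer note: a usage sketch for `LocalResponseNormalizationFwd` as declared above. The AlexNet-style hyperparameters and the buffers `x`, `y`, `ws` are assumptions for illustration; the workspace is sized from the `ws_size_` / `ws_dims_` members the class publishes:

```cpp
mkldnn::memory::dims src_d = {32, 96, 55, 55};

// Classic cross-channel LRN: local size 5, k = 2, alpha = 1e-4, beta = 0.75
LocalResponseNormalizationFwd<float> lrn(
    src_d, mkldnn::memory::format::nchw,
    /*n=*/5, /*k=*/2.0, /*alpha=*/1e-4, /*beta=*/0.75,
    mkldnn::algorithm::lrn_across_channels);

lrn.execute(x, y, ws); // ws: caller-owned scratch buffer for the backward pass
```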
+ * + */ + + +#ifndef _OP_H_ +#define _OP_H_ + +#include +#include + +template +class Op { +public: + virtual ~Op() {} + virtual void execute(){ return; }; + virtual void setup(){ return; }; +}; + +#endif // _OP_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/pooling_bwd.h b/python/ideep4py/include/primitives/ops/pooling_bwd.h new file mode 100644 index 00000000..101c1eec --- /dev/null +++ b/python/ideep4py/include/primitives/ops/pooling_bwd.h @@ -0,0 +1,107 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once +#ifndef _POOLING_BWD_H_ +#define _POOLING_BWD_H_ + +#include +#include +#include +#include +#include "op.h" + +template +class Pooling2DBwd: public Op{ +public: + Pooling2DBwd(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind); // alg_kind = pooling_max + // or pooling_avg + ~Pooling2DBwd(); + + /* + * Pooling backward primitive setup + * Params: + * diff_src_d: diff src + * diff_dst_d: diff dst + */ + void setup(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind); // alg_kind = pooling_max + // or pooling_avg + + /* + * Pooling backward execute + * params: + * diff_src: diff_src + * diff_dst: diff_dst + * ws: workspace + */ + void execute(void *diff_src, void *diff_dst, void *ws=NULL); + +public: + // expected memory format + mkldnn::memory::format diff_src_fmt_; + mkldnn::memory::format diff_dst_fmt_; + mkldnn::memory::format ws_fmt_; + + // algo + mkldnn::algorithm alg_kind_; +private: + // pooling primitive + std::shared_ptr bwd_; + std::shared_ptr bwd_stream_; + + // MKL-DNN memory, just dummy data + std::shared_ptr ws_mem_; + std::shared_ptr diff_src_mem_; + std::shared_ptr diff_dst_mem_; + std::shared_ptr diff_src_md_; + std::shared_ptr diff_dst_md_; + + // fwd hint + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + std::shared_ptr bwd_desc_; + std::shared_ptr bwd_pd_; + + std::vector bwd_primitives_; +}; + +#endif // _POOLING_BWD_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git 
a/python/ideep4py/include/primitives/ops/pooling_fwd.h b/python/ideep4py/include/primitives/ops/pooling_fwd.h new file mode 100644 index 00000000..9c716373 --- /dev/null +++ b/python/ideep4py/include/primitives/ops/pooling_fwd.h @@ -0,0 +1,101 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once +#ifndef _POOLING_FWD_H_ +#define _POOLING_FWD_H_ + +#include +#include +#include +#include +#include "op.h" + +template +class Pooling2DFwd: public Op{ +public: + Pooling2DFwd(mkldnn::memory::dims src_d, mkldnn::memory::dims dst_d, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind); // alg_kind = pooling_max + // or pooling_avg + ~Pooling2DFwd(); + + /* + * Pooling forward primitive setup + * Params: + * src_d: input + * dst_d: out_put + */ + void setup(mkldnn::memory::dims src_d, mkldnn::memory::dims dst_d, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind); // alg_kind = pooling_max + // or pooling_avg + + /* + * Pooling forward execute + * params: + * src: input + * dst: output + * ws: workspace + */ + void execute(void *src, void *dst, void *ws=NULL); + +public: + // expected memory format + mkldnn::memory::format src_fmt_; + mkldnn::memory::format dst_fmt_; + mkldnn::memory::format ws_fmt_; + //workspace size + mkldnn::memory::dims ws_dims_; + mkldnn::memory::data_type ws_dt_; + size_t ws_size_; + + // algo + mkldnn::algorithm alg_kind_; +private: + // pooling primitive + std::shared_ptr fwd_; + std::shared_ptr fwd_stream_; + + // MKL-DNN memory, just dummy data + std::shared_ptr ws_mem_; + std::shared_ptr src_mem_; + std::shared_ptr dst_mem_; + std::shared_ptr src_md_; + std::shared_ptr dst_md_; + + std::shared_ptr fwd_desc_; + std::shared_ptr fwd_pd_; + + std::vector fwd_primitives_; +}; + +#endif // _POOLING_FWD_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/ops/reorder_op.h b/python/ideep4py/include/primitives/ops/reorder_op.h new file mode 100644 index 00000000..28a548bc --- /dev/null +++ b/python/ideep4py/include/primitives/ops/reorder_op.h @@ -0,0 +1,78 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
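Reviewer note: a usage sketch for `Pooling2DFwd` as declared above, for a 2x2 max pool with stride 2; shapes and the buffers `x`, `y`, `ws` are illustrative:

```cpp
mkldnn::memory::dims src_d = {32, 64, 56, 56};
mkldnn::memory::dims dst_d = {32, 64, 28, 28}; // 2x2 window, stride 2, no padding

Pooling2DFwd<float> pool(src_d, dst_d,
                         /*ker_h=*/2, /*ker_w=*/2,
                         /*sy=*/2, /*sx=*/2,
                         /*pad_lh=*/0, /*pad_lw=*/0,
                         /*pad_rh=*/0, /*pad_rw=*/0,
                         mkldnn::algorithm::pooling_max);

pool.execute(x, y, ws); // ws records the max indices; avg pooling ignores it
```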
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _REORDER_OP_H_ +#define _REORDER_OP_H_ + +#include +#include +#include +#include "op.h" + +template +class ReorderOp : public Op +{ +public: + ReorderOp(mkldnn::memory::dims dims, mkldnn::memory::format src_fmt, mkldnn::memory::format dst_fmt); + ~ReorderOp(); + + /* + * Reorder primitive setup + * Params: + * dims: + * src_fmt: + * dst_fmt: + */ + void setup(mkldnn::memory::dims dims, mkldnn::memory::format src_fmt, mkldnn::memory::format dst_fmt); + + /* + * reorder execute + */ + void execute(void* src, void* dst); + +public: + // expected memory format for this primitive instance + mkldnn::memory::format src_fmt_; + mkldnn::memory::format dst_fmt_; + + // reorder primitive + std::shared_ptr reorder_prim_; + +private: + //MKLDNN memory + //forward + std::shared_ptr src_mem_; // x + std::shared_ptr dst_mem_; //y + + std::shared_ptr reorder_stream_; + + //memory desc + std::shared_ptr src_md_; //x + std::shared_ptr dst_md_; // y +}; + +#endif // _REORDER_OP_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/pooling.h b/python/ideep4py/include/primitives/pooling.h new file mode 100644 index 00000000..8c0403ef --- /dev/null +++ b/python/ideep4py/include/primitives/pooling.h @@ -0,0 +1,71 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
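Reviewer note: a usage sketch for `ReorderOp` as declared above, converting a plain NCHW buffer into MKL-DNN's 8-channel-blocked layout; the buffers `x_nchw` and `x_blocked` are hypothetical:

```cpp
mkldnn::memory::dims d = {32, 64, 28, 28};

// Plain NCHW into the blocked layout favored by vectorized kernels
ReorderOp<float> reorder(d, mkldnn::memory::format::nchw,
                            mkldnn::memory::format::nChw8c);
reorder.execute(x_nchw, x_blocked);
```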
+ * + */ + + +#ifndef _POOLING_H_ +#define _POOLING_H_ + +#include +#include +#include +#include "layer.h" +#include "op_param.h" +#include "tensor.h" + +template +class Pooling2D : public Layer +{ +public: + Pooling2D(); + ~Pooling2D(); + + /* + * Pooling Forward + * params: + * src: input, x + * pp: pooling parameters + * + * ret + * vector: + * Max pooling: return dst and workspace + * Avg pooling: return dst + */ + static std::vector Forward(Tensor *src, + pooling_param_t *pp); + + /* + * Pooling backward + * param: + * diff_dst: diff dst, gy + * pp: pooling parameters + */ + static Tensor *Backward(Tensor *diff_dst, + Tensor *ws, + pooling_param_t *pp); + +}; + +#endif // _POOLING_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/include/primitives/prim_mgr/bn_bwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/bn_bwd_factory.h new file mode 100644 index 00000000..86096a88 --- /dev/null +++ b/python/ideep4py/include/primitives/prim_mgr/bn_bwd_factory.h @@ -0,0 +1,96 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#ifndef _BN_BWD_FACTORY_ +#define _BN_BWD_FACTORY_ + +#include +#include +#include "op.h" +#include "op_factory.h" +#include +#include "utils.h" +#include "bn_bwd.h" + +template +class batch_normalization_bwd_factory : public OpFactory { + +private: + batch_normalization_bwd_factory() {} + ~batch_normalization_bwd_factory() {} + +public: + static batch_normalization_bwd * get(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, float eps, bool scale_shift) { + auto bn_bwd = dynamic_cast*>( + batch_normalization_bwd_factory::get_instance().get_bn_bwd( + src_d, diff_dst_d, eps, scale_shift)); + + if (bn_bwd == nullptr) { + bn_bwd = new batch_normalization_bwd( + src_d, diff_dst_d, eps, scale_shift); + batch_normalization_bwd_factory::get_instance().set_bn_bwd( + src_d, diff_dst_d, eps, scale_shift, bn_bwd); + } + + return bn_bwd; + } + + static batch_normalization_bwd_factory & get_instance() { + static batch_normalization_bwd_factory instance_; + return instance_; + } + +private: +#define BN_BWD_PREFIX "bn_bwd_" + Op * get_bn_bwd(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + float eps, bool scale_shift) { + + std::string key = BN_BWD_PREFIX; + + key += dims_to_string(src_d); + key += dims_to_string(diff_dst_d); + key += float_to_string(eps); + key += bool_to_string(scale_shift); + + return this->get_op(key); + } + + void set_bn_bwd(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + float eps, bool scale_shift, Op *op) { + + std::string key = BN_BWD_PREFIX; + + key += dims_to_string(src_d); + key += dims_to_string(diff_dst_d); + key += float_to_string(eps); + key += bool_to_string(scale_shift); + + this->set_op(key, op); + } +}; + +#endif // _BN_BWD_FACTORY_ diff --git a/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h new file mode 100644 index 00000000..d3b36b76 --- /dev/null +++ b/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h @@ -0,0 +1,98 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
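Reviewer note: a sketch of how the factory above memoizes primitives. The first `get()` with a given shape/attribute combination builds the primitive and caches it under a string key composed from the arguments; a second call with identical arguments returns the same cached object. Shapes are illustrative:

```cpp
#include <cassert>

mkldnn::memory::dims src_d      = {32, 64, 28, 28};
mkldnn::memory::dims diff_dst_d = {32, 64, 28, 28};

auto *p1 = batch_normalization_bwd_factory<float>::get(src_d, diff_dst_d,
                                                       1e-5f, /*scale_shift=*/true);
auto *p2 = batch_normalization_bwd_factory<float>::get(src_d, diff_dst_d,
                                                       1e-5f, true);
assert(p1 == p2); // one primitive per unique key
```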
diff --git a/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h
new file mode 100644
index 00000000..d3b36b76
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/bn_fwd_factory.h
@@ -0,0 +1,98 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _BN_FWD_FACTORY_
+#define _BN_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "bn_fwd.h"
+
+template <typename T>
+class batch_normalization_fwd_factory : public OpFactory<T> {
+
+private:
+    batch_normalization_fwd_factory() {}
+    ~batch_normalization_fwd_factory() {}
+
+public:
+    static batch_normalization_fwd<T> *get(
+        mkldnn::memory::dims src_d, float eps,
+        bool scale_shift, bool global_stats, bool training) {
+
+        auto bn_fwd = dynamic_cast<batch_normalization_fwd<T> *>(
+            batch_normalization_fwd_factory<T>::get_instance().get_bn_fwd(
+                src_d, eps, scale_shift, global_stats, training));
+
+        if (bn_fwd == nullptr) {
+            bn_fwd = new batch_normalization_fwd<T>(
+                src_d, eps, scale_shift, global_stats, training);
+            batch_normalization_fwd_factory<T>::get_instance().set_bn_fwd(
+                src_d, eps, scale_shift, global_stats, training, bn_fwd);
+        }
+
+        return bn_fwd;
+    }
+
+    static batch_normalization_fwd_factory &get_instance() {
+        static batch_normalization_fwd_factory instance_;
+        return instance_;
+    }
+
+private:
+#define BN_FWD_PREFIX "bn_fwd_"
+    Op<T> *get_bn_fwd(mkldnn::memory::dims src_d, float eps, bool scale_shift,
+                      bool global_stats, bool training) {
+
+        std::string key = BN_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += float_to_string(eps);
+        key += bool_to_string(scale_shift);
+        key += bool_to_string(global_stats);
+        key += bool_to_string(training);
+
+        return this->get_op(key);
+    }
+
+    void set_bn_fwd(mkldnn::memory::dims src_d, float eps, bool scale_shift,
+                    bool global_stats, bool training, Op<T> *op) {
+
+        std::string key = BN_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += float_to_string(eps);
+        key += bool_to_string(scale_shift);
+        key += bool_to_string(global_stats);
+        key += bool_to_string(training);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _BN_FWD_FACTORY_
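Both batch-normalization factories follow the memoized-singleton pattern used throughout this directory: get() folds the shape and hyper-parameters into a string key, consults the OpFactory map, and constructs a new primitive only on a miss. A hypothetical call site (template argument and values are illustrative):

    // Two identical requests yield one primitive instance.
    auto *fwd_a = batch_normalization_fwd_factory<float>::get(
        src_d, /*eps=*/1e-5f, /*scale_shift=*/true,
        /*global_stats=*/false, /*training=*/true);
    auto *fwd_b = batch_normalization_fwd_factory<float>::get(
        src_d, 1e-5f, true, false, true);
    assert(fwd_a == fwd_b);  // reused, provided enable_prim_reuse is true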
diff --git a/python/ideep4py/include/primitives/prim_mgr/concat_bwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/concat_bwd_factory.h
new file mode 100644
index 00000000..ee3607a8
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/concat_bwd_factory.h
@@ -0,0 +1,99 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONCAT_BWD_FACTORY_
+#define _CONCAT_BWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "concat_bwd.h"
+
+template <typename T>
+class ConcatBwdFactory : public OpFactory<T>
+{
+private:
+    ConcatBwdFactory() {}
+    ~ConcatBwdFactory() {}
+
+public:
+    static ConcatBwd<T>* get(std::vector<mkldnn::memory::dims> diff_src,
+                             mkldnn::memory::dims diff_dst,
+                             int axis) {
+        ConcatBwd<T>* concat_backward = NULL;
+
+        //try to find a suitable one in pool
+        concat_backward = dynamic_cast<ConcatBwd<T>*>(
+            ConcatBwdFactory<T>::get_instance().get_concat_bwd(diff_src, diff_dst, axis));
+
+        if (concat_backward == NULL) {
+            //LOG(INFO) << "create a new one for concat bwd";
+            concat_backward = new ConcatBwd<T>(diff_src, diff_dst, axis);
+            ConcatBwdFactory<T>::get_instance().set_concat_bwd(diff_src, diff_dst, axis, concat_backward);
+        } else {
+            //LOG(INFO) << "reuse existing one for concat bwd";
+        }
+        return concat_backward;
+    }
+
+    static ConcatBwdFactory& get_instance() {
+        static ConcatBwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define CONCAT_BWD_PREFIX "concat_bwd_"
+    Op<T>* get_concat_bwd(std::vector<mkldnn::memory::dims> diff_src,
+                          mkldnn::memory::dims diff_dst,
+                          int axis) {
+        std::string key = CONCAT_BWD_PREFIX;
+
+        for (int i = 0; i < diff_src.size(); i++) {
+            key += dims_to_string(diff_src[i]);
+        }
+        key += dims_to_string(diff_dst);
+        key += int_to_string(axis);
+
+        return this->get_op(key);
+    }
+
+    void set_concat_bwd(std::vector<mkldnn::memory::dims> diff_src,
+                        mkldnn::memory::dims diff_dst,
+                        int axis,
+                        Op<T> *op) {
+        std::string key = CONCAT_BWD_PREFIX;
+
+        for (int i = 0; i < diff_src.size(); i++) {
+            key += dims_to_string(diff_src[i]);
+        }
+        key += dims_to_string(diff_dst);
+        key += int_to_string(axis);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _CONCAT_BWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/concat_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/concat_fwd_factory.h
new file mode 100644
index 00000000..7ced4089
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/concat_fwd_factory.h
@@ -0,0 +1,99 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONCAT_FWD_FACTORY_
+#define _CONCAT_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "concat_fwd.h"
+
+template <typename T>
+class ConcatFwdFactory : public OpFactory<T>
+{
+private:
+    ConcatFwdFactory() {}
+    ~ConcatFwdFactory() {}
+
+public:
+    static ConcatFwd<T>* get(std::vector<mkldnn::memory::dims> src,
+                             mkldnn::memory::dims dst,
+                             int axis) {
+        ConcatFwd<T>* concat_forward = NULL;
+
+        //try to find a suitable one in pool
+        concat_forward = dynamic_cast<ConcatFwd<T>*>(
+            ConcatFwdFactory<T>::get_instance().get_concat_fwd(src, dst, axis));
+
+        if (concat_forward == NULL) {
+            //LOG(INFO) << "create a new one for concat fwd";
+            concat_forward = new ConcatFwd<T>(src, dst, axis);
+            ConcatFwdFactory<T>::get_instance().set_concat_fwd(src, dst, axis, concat_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for concat fwd";
+        }
+        return concat_forward;
+    }
+
+    static ConcatFwdFactory& get_instance() {
+        static ConcatFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define CONCAT_FWD_PREFIX "concat_fwd_"
+    Op<T>* get_concat_fwd(std::vector<mkldnn::memory::dims> src,
+                          mkldnn::memory::dims dst,
+                          int axis) {
+        std::string key = CONCAT_FWD_PREFIX;
+
+        for (int i = 0; i < src.size(); i++) {
+            key += dims_to_string(src[i]);
+        }
+        key += dims_to_string(dst);
+        key += int_to_string(axis);
+
+        return this->get_op(key);
+    }
+
+    void set_concat_fwd(std::vector<mkldnn::memory::dims> src,
+                        mkldnn::memory::dims dst,
+                        int axis,
+                        Op<T> *op) {
+        std::string key = CONCAT_FWD_PREFIX;
+
+        for (int i = 0; i < src.size(); i++) {
+            key += dims_to_string(src[i]);
+        }
+        key += dims_to_string(dst);
+        key += int_to_string(axis);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _CONCAT_FWD_FACTORY_
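Unlike the fixed-arity primitives, the concat factories fold a variable number of input shapes into the cache key, one dims_to_string fragment per input. dims_to_string and int_to_string come from utils.h, which is not shown here, so the exact rendering below is an assumption:

    // Illustrative key for concatenating two 4-D tensors along axis 1:
    //   "concat_fwd_" + dims_to_string({2,16,24,24})
    //                 + dims_to_string({2,16,24,24})
    //                 + dims_to_string({2,32,24,24})   // dst
    //                 + int_to_string(1)               // axis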
diff --git a/python/ideep4py/include/primitives/prim_mgr/conv_bwd_data_factory.h b/python/ideep4py/include/primitives/prim_mgr/conv_bwd_data_factory.h
new file mode 100644
index 00000000..3c092563
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/conv_bwd_data_factory.h
@@ -0,0 +1,120 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONV_BWD_DATA_FACTORY_
+#define _CONV_BWD_DATA_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "conv_bwd_data.h"
+
+template <typename T>
+class Convolution2DBwdDataFactory : public OpFactory<T>
+{
+private:
+    Convolution2DBwdDataFactory() {}
+    ~Convolution2DBwdDataFactory() {}
+
+public:
+    static Convolution2DBwdData<T>* get(mkldnn::memory::dims diff_src,
+                                        mkldnn::memory::dims w,
+                                        mkldnn::memory::dims diff_dst,
+                                        int dilate_y, int dilate_x,
+                                        int sy, int sx,
+                                        int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        Convolution2DBwdData<T>* conv2d_backward_data = NULL;
+
+        //try to find a suitable one in pool
+        conv2d_backward_data = dynamic_cast<Convolution2DBwdData<T>*>(
+            Convolution2DBwdDataFactory<T>::get_instance().get_conv2d_bwd_data(
+                diff_src, w, diff_dst, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw));
+
+        if (conv2d_backward_data == NULL) {
+            //LOG(INFO) << "create a new one for conv2d bwd data";
+            conv2d_backward_data = new Convolution2DBwdData<T>(
+                diff_src, w, diff_dst, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw);
+            Convolution2DBwdDataFactory<T>::get_instance().set_conv2d_bwd_data(
+                diff_src, w, diff_dst, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw,
+                conv2d_backward_data);
+        } else {
+            //LOG(INFO) << "reuse an existing one for conv2d bwd data";
+        }
+        return conv2d_backward_data;
+    }
+
+    static Convolution2DBwdDataFactory& get_instance() {
+        static Convolution2DBwdDataFactory instance_;
+        return instance_;
+    }
+
+private:
+#define CONVOLUTION2D_BWD_DATA_PREFIX "conv2d_bwd_data_"
+    Op<T>* get_conv2d_bwd_data(mkldnn::memory::dims diff_src,
+                               mkldnn::memory::dims w,
+                               mkldnn::memory::dims diff_dst,
+                               int dilate_y, int dilate_x,
+                               int sy, int sx,
+                               int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        std::string key = CONVOLUTION2D_BWD_DATA_PREFIX;
+
+        key += dims_to_string(diff_src);
+        key += dims_to_string(w);
+        key += dims_to_string(diff_dst);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        return this->get_op(key);
+    }
+
+    void set_conv2d_bwd_data(mkldnn::memory::dims diff_src,
+                             mkldnn::memory::dims w,
+                             mkldnn::memory::dims diff_dst,
+                             int dilate_y, int dilate_x,
+                             int sy, int sx,
+                             int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                             Op<T> *op) {
+        std::string key = CONVOLUTION2D_BWD_DATA_PREFIX;
+
+        key += dims_to_string(diff_src);
+        key += dims_to_string(w);
+        key += dims_to_string(diff_dst);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _CONV_BWD_DATA_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/conv_bwd_weights_factory.h b/python/ideep4py/include/primitives/prim_mgr/conv_bwd_weights_factory.h
new file mode 100644
index 00000000..b33e42ac
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/conv_bwd_weights_factory.h
@@ -0,0 +1,119 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONV_BWD_WEIGHTS_FACTORY_
+#define _CONV_BWD_WEIGHTS_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "conv_bwd_weights.h"
+
+template <typename T>
+class Convolution2DBwdWeightsFactory : public OpFactory<T>
+{
+private:
+    Convolution2DBwdWeightsFactory() {}
+    ~Convolution2DBwdWeightsFactory() {}
+
+public:
+    static Convolution2DBwdWeights<T>* get(mkldnn::memory::dims x, mkldnn::memory::dims diff_w,
+                                           mkldnn::memory::dims diff_b, mkldnn::memory::dims diff_y,
+                                           int dilate_y, int dilate_x,
+                                           int sy, int sx,
+                                           int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        Convolution2DBwdWeights<T>* conv2d_backward_weights = NULL;
+
+        //try to find a suitable one in pool
+        conv2d_backward_weights = dynamic_cast<Convolution2DBwdWeights<T>*>(
+            Convolution2DBwdWeightsFactory<T>::get_instance().get_conv2d_bwd_weights(
+                x, diff_w, diff_b, diff_y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw));
+
+        if (conv2d_backward_weights == NULL) {
+            //LOG(INFO) << "create a new one for conv2d bwd weights";
+            conv2d_backward_weights = new Convolution2DBwdWeights<T>(
+                x, diff_w, diff_b, diff_y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw);
+            Convolution2DBwdWeightsFactory<T>::get_instance().set_conv2d_bwd_weights(
+                x, diff_w, diff_b, diff_y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw,
+                conv2d_backward_weights);
+        } else {
+            //LOG(INFO) << "reuse an existing one for conv2d bwd weights";
+        }
+        return conv2d_backward_weights;
+    }
+
+    static Convolution2DBwdWeightsFactory& get_instance() {
+        static Convolution2DBwdWeightsFactory instance_;
+        return instance_;
+    }
+
+private:
+#define CONVOLUTION2D_BWD_WEIGHTS_PREFIX "conv2d_bwd_weights_"
+    Op<T>* get_conv2d_bwd_weights(mkldnn::memory::dims x, mkldnn::memory::dims diff_w,
+                                  mkldnn::memory::dims diff_b, mkldnn::memory::dims diff_y,
+                                  int dilate_y, int dilate_x,
+                                  int sy, int sx,
+                                  int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        std::string key = CONVOLUTION2D_BWD_WEIGHTS_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(diff_w);
+        key += dims_to_string(diff_b);
+        key += dims_to_string(diff_y);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        return this->get_op(key);
+    }
+
+    void set_conv2d_bwd_weights(mkldnn::memory::dims x,
+                                mkldnn::memory::dims diff_w,
+                                mkldnn::memory::dims diff_b, mkldnn::memory::dims diff_y,
+                                int dilate_y, int dilate_x,
+                                int sy, int sx,
+                                int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                                Op<T> *op) {
+        std::string key = CONVOLUTION2D_BWD_WEIGHTS_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(diff_w);
+        key += dims_to_string(diff_b);
+        key += dims_to_string(diff_y);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _CONV_BWD_WEIGHTS_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/conv_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/conv_fwd_factory.h
new file mode 100644
index 00000000..421b002f
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/conv_fwd_factory.h
@@ -0,0 +1,119 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _CONV_FWD_FACTORY_
+#define _CONV_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "conv_fwd.h"
+
+template <typename T>
+class Convolution2DFwdFactory : public OpFactory<T>
+{
+private:
+    Convolution2DFwdFactory() {}
+    ~Convolution2DFwdFactory() {}
+
+public:
+    static Convolution2DFwd<T>* get(mkldnn::memory::dims x, mkldnn::memory::dims w,
+                                    mkldnn::memory::dims b, mkldnn::memory::dims y,
+                                    int dilate_y, int dilate_x,
+                                    int sy, int sx,
+                                    int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        Convolution2DFwd<T>* conv2d_forward = NULL;
+
+        //try to find a suitable one in pool
+        conv2d_forward = dynamic_cast<Convolution2DFwd<T>*>(
+            Convolution2DFwdFactory<T>::get_instance().get_conv2d_fwd(
+                x, w, b, y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw));
+
+        if (conv2d_forward == NULL) {
+            //LOG(INFO) << "create a new one for conv2d fwd";
+            conv2d_forward = new Convolution2DFwd<T>(
+                x, w, b, y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw);
+            Convolution2DFwdFactory<T>::get_instance().set_conv2d_fwd(
+                x, w, b, y, dilate_y, dilate_x, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, conv2d_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for conv2d fwd";
+        }
+        return conv2d_forward;
+    }
+
+    static Convolution2DFwdFactory& get_instance() {
+        static Convolution2DFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define CONVOLUTION2D_FWD_PREFIX "conv2d_fwd_"
+    Op<T>* get_conv2d_fwd(mkldnn::memory::dims x, mkldnn::memory::dims w,
+                          mkldnn::memory::dims b, mkldnn::memory::dims y,
+                          int dilate_y, int dilate_x,
+                          int sy, int sx,
+                          int pad_lh, int pad_lw, int pad_rh, int pad_rw) {
+        std::string key = CONVOLUTION2D_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(w);
+        key += dims_to_string(b);
+        key += dims_to_string(y);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        return this->get_op(key);
+    }
+
+    void set_conv2d_fwd(mkldnn::memory::dims x, mkldnn::memory::dims w,
+                        mkldnn::memory::dims b, mkldnn::memory::dims y,
+                        int dilate_y, int dilate_x,
+                        int sy, int sx,
+                        int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                        Op<T> *op) {
+        std::string key = CONVOLUTION2D_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(w);
+        key += dims_to_string(b);
+        key += dims_to_string(y);
+        key += int_to_string(dilate_y);
+        key += int_to_string(dilate_x);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _CONV_FWD_FACTORY_
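All three convolution factories key on the same geometry tuple: tensor dims plus dilation, stride, and left/right padding. A hypothetical forward lookup (values illustrative; a dilation of 0 means no dilation in MKL-DNN's convention):

    // 32x3x224x224 input, 64x3x7x7 weights, stride 2, symmetric pad 3.
    mkldnn::memory::dims x = {32, 3, 224, 224}, w = {64, 3, 7, 7};
    mkldnn::memory::dims b = {64}, y = {32, 64, 112, 112};
    auto *conv = Convolution2DFwdFactory<float>::get(
        x, w, b, y, /*dilate_y=*/0, /*dilate_x=*/0, /*sy=*/2, /*sx=*/2,
        /*pad_lh=*/3, /*pad_lw=*/3, /*pad_rh=*/3, /*pad_rw=*/3);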
diff --git a/python/ideep4py/include/primitives/prim_mgr/eltwise_bwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/eltwise_bwd_factory.h
new file mode 100644
index 00000000..5fc8a902
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/eltwise_bwd_factory.h
@@ -0,0 +1,90 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#pragma once
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "eltwise_bwd.h"
+
+template <typename T1, typename T2>
+class EltwiseBwdFactory : public OpFactory<T1>
+{
+private:
+    EltwiseBwdFactory() {}
+    ~EltwiseBwdFactory() {}
+
+public:
+    static EltwiseBwd<T1, T2>* get(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta) {
+        EltwiseBwd<T1, T2>* eltwise_backward = nullptr;
+
+        //try to find a suitable one in pool
+        eltwise_backward = dynamic_cast<EltwiseBwd<T1, T2>*>(
+            EltwiseBwdFactory<T1, T2>::get_instance().get_eltwise_bwd(x, alg_kind, dst_diff_fmt, alpha, beta));
+
+        if (eltwise_backward == nullptr) {
+            //LOG(INFO) << "create a new one for eltwise bwd";
+            eltwise_backward = new EltwiseBwd<T1, T2>(x, alg_kind, dst_diff_fmt, alpha, beta);
+            EltwiseBwdFactory<T1, T2>::get_instance().set_eltwise_bwd(x, alg_kind, dst_diff_fmt, alpha, beta, eltwise_backward);
+        } else {
+            //LOG(INFO) << "reuse existing one for eltwise bwd";
+        }
+        return eltwise_backward;
+    }
+
+    static EltwiseBwdFactory& get_instance() {
+        static EltwiseBwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define ELTWISE_BWD_PREFIX "eltwise_bwd_"
+    Op<T1>* get_eltwise_bwd(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta) {
+        std::string key = ELTWISE_BWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += int_to_string((int)alg_kind);
+        key += float_to_string((float)alpha);
+        key += float_to_string((float)beta);
+        key += int_to_string(dst_diff_fmt);
+
+        return this->get_op(key);
+    }
+
+    void set_eltwise_bwd(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta, Op<T1> *op) {
+        std::string key = ELTWISE_BWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += int_to_string((int)alg_kind);
+        key += float_to_string((float)alpha);
+        key += float_to_string((float)beta);
+        key += int_to_string(dst_diff_fmt);
+
+        this->set_op(key, op);
+    }
+};
diff --git a/python/ideep4py/include/primitives/prim_mgr/eltwise_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/eltwise_fwd_factory.h
new file mode 100644
index 00000000..db5bd3e4
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/eltwise_fwd_factory.h
@@ -0,0 +1,93 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#pragma once
+
+#include <mkldnn.hpp>
+#include <string>
+#include <vector>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "eltwise_fwd.h"
+
+template <typename T1, typename T2>
+class EltwiseFwdFactory : public OpFactory<T1>
+{
+private:
+    EltwiseFwdFactory() {}
+    ~EltwiseFwdFactory() {}
+
+public:
+    static EltwiseFwd<T1, T2>* get(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta) {
+        EltwiseFwd<T1, T2>* eltwise_forward = nullptr;
+
+        //try to find a suitable one in pool
+        eltwise_forward = dynamic_cast<EltwiseFwd<T1, T2>*>(
+            EltwiseFwdFactory<T1, T2>::get_instance().get_eltwise_fwd(x, alg_kind, src_fmt, alpha, beta));
+
+        if (eltwise_forward == nullptr) {
+            //LOG(INFO) << "create a new one for eltwise fwd";
+            eltwise_forward = new EltwiseFwd<T1, T2>(x, alg_kind, src_fmt, alpha, beta);
+            EltwiseFwdFactory<T1, T2>::get_instance().set_eltwise_fwd(x, alg_kind, src_fmt, alpha, beta, eltwise_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for eltwise fwd";
+        }
+        return eltwise_forward;
+    }
+
+    static EltwiseFwdFactory& get_instance() {
+        static EltwiseFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define ELTWISE_FWD_PREFIX "eltwise_fwd_"
+    Op<T1>* get_eltwise_fwd(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta) {
+        std::string key = ELTWISE_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += int_to_string((int)alg_kind);
+        // key += typeid(alpha).name();
+        key += float_to_string((float)alpha);
+        key += float_to_string((float)beta);
+        key += int_to_string(src_fmt);
+
+        return this->get_op(key);
+    }
+
+    void set_eltwise_fwd(mkldnn::memory::dims x, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta, Op<T1>* op) {
+        std::string key = ELTWISE_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += int_to_string((int)alg_kind);
+        // key += typeid(alpha).name();
+        key += float_to_string((float)alpha);
+        key += float_to_string((float)beta);
+        key += int_to_string(src_fmt);
+
+        this->set_op(key, op);
+    }
+};
diff --git a/python/ideep4py/include/primitives/prim_mgr/linear_bwd_data_factory.h b/python/ideep4py/include/primitives/prim_mgr/linear_bwd_data_factory.h
new file mode 100644
index 00000000..115846a0
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/linear_bwd_data_factory.h
@@ -0,0 +1,92 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _LINEAR_BWD_DATA_FACTORY_
+#define _LINEAR_BWD_DATA_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "linear_bwd_data.h"
+
+template <typename T>
+class LinearBwdDataFactory : public OpFactory<T>
+{
+private:
+    LinearBwdDataFactory() {}
+    ~LinearBwdDataFactory() {}
+
+public:
+    static LinearBwdData<T>* get(mkldnn::memory::dims diff_src,
+                                 mkldnn::memory::dims w, mkldnn::memory::dims diff_dst) {
+        LinearBwdData<T>* linear_backward_data = NULL;
+        //try to find a suitable one in pool
+        linear_backward_data = dynamic_cast<LinearBwdData<T>*>(
+            LinearBwdDataFactory<T>::get_instance().get_linear_bwd_data(diff_src, w, diff_dst));
+        if (linear_backward_data == NULL) {
+            //LOG(INFO) << "create a new one for linear bwd data";
+            linear_backward_data = new LinearBwdData<T>(diff_src, w, diff_dst);
+            LinearBwdDataFactory<T>::get_instance().set_linear_bwd_data(diff_src, w, diff_dst, linear_backward_data);
+        } else {
+            //LOG(INFO) << "reuse an existing one for linear bwd data";
+        }
+        return linear_backward_data;
+    }
+
+    static LinearBwdDataFactory& get_instance() {
+        static LinearBwdDataFactory instance_;
+        return instance_;
+    }
+
+private:
+#define LINEAR_BWD_DATA_PREFIX "linear_bwd_data_"
+    Op<T>* get_linear_bwd_data(mkldnn::memory::dims diff_src,
+                               mkldnn::memory::dims w,
+                               mkldnn::memory::dims diff_dst) {
+        std::string key = LINEAR_BWD_DATA_PREFIX;
+
+        key += dims_to_string(diff_src);
+        key += dims_to_string(w);
+        key += dims_to_string(diff_dst);
+
+        return this->get_op(key);
+    }
+
+    void set_linear_bwd_data(mkldnn::memory::dims diff_src,
+                             mkldnn::memory::dims w,
+                             mkldnn::memory::dims diff_dst,
+                             Op<T> *op) {
+        std::string key = LINEAR_BWD_DATA_PREFIX;
+
+        key += dims_to_string(diff_src);
+        key += dims_to_string(w);
+        key += dims_to_string(diff_dst);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _LINEAR_BWD_DATA_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/linear_bwd_weights_factory.h b/python/ideep4py/include/primitives/prim_mgr/linear_bwd_weights_factory.h
new file mode 100644
index 00000000..c073ff05
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/linear_bwd_weights_factory.h
@@ -0,0 +1,96 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _LINEAR_BWD_WEIGHTS_FACTORY_
+#define _LINEAR_BWD_WEIGHTS_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "linear_bwd_weights.h"
+
+template <typename T>
+class LinearBwdWeightsFactory : public OpFactory<T>
+{
+private:
+    LinearBwdWeightsFactory() {}
+    ~LinearBwdWeightsFactory() {}
+
+public:
+    static LinearBwdWeights<T>* get(mkldnn::memory::dims x, mkldnn::memory::dims diff_w,
+                                    mkldnn::memory::dims diff_b, mkldnn::memory::dims diff_y) {
+        LinearBwdWeights<T>* linear_backward_weights = NULL;
+        //try to find a suitable one in pool
+        linear_backward_weights = dynamic_cast<LinearBwdWeights<T>*>(
+            LinearBwdWeightsFactory<T>::get_instance().get_linear_bwd_weights(x, diff_w, diff_b, diff_y));
+        if (linear_backward_weights == NULL) {
+            //LOG(INFO) << "create a new one for linear bwd weights";
+            linear_backward_weights = new LinearBwdWeights<T>(x, diff_w, diff_b, diff_y);
+            LinearBwdWeightsFactory<T>::get_instance().set_linear_bwd_weights(x, diff_w, diff_b, diff_y, linear_backward_weights);
+        } else {
+            //LOG(INFO) << "reuse an existing one for linear bwd weights";
+        }
+        return linear_backward_weights;
+    }
+
+    static LinearBwdWeightsFactory& get_instance() {
+        static LinearBwdWeightsFactory instance_;
+        return instance_;
+    }
+
+private:
+#define LINEAR_BWD_WEIGHTS_PREFIX "linear_bwd_weights_"
+    Op<T>* get_linear_bwd_weights(mkldnn::memory::dims x,
+                                  mkldnn::memory::dims diff_w,
+                                  mkldnn::memory::dims diff_b,
+                                  mkldnn::memory::dims diff_y) {
+        std::string key = LINEAR_BWD_WEIGHTS_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(diff_w);
+        key += dims_to_string(diff_b);
+        key += dims_to_string(diff_y);
+
+        return this->get_op(key);
+    }
+
+    void set_linear_bwd_weights(mkldnn::memory::dims x,
+                                mkldnn::memory::dims diff_w,
+                                mkldnn::memory::dims diff_b,
+                                mkldnn::memory::dims diff_y,
+                                Op<T> *op) {
+        std::string key = LINEAR_BWD_WEIGHTS_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(diff_w);
+        key += dims_to_string(diff_b);
+        key += dims_to_string(diff_y);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _LINEAR_BWD_WEIGHTS_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/linear_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/linear_fwd_factory.h
new file mode 100644
index 00000000..2e8f951f
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/linear_fwd_factory.h
@@ -0,0 +1,95 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _LINEAR_FWD_FACTORY_
+#define _LINEAR_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "linear_fwd.h"
+
+template <typename T>
+class LinearFwdFactory : public OpFactory<T>
+{
+private:
+    LinearFwdFactory() {}
+    ~LinearFwdFactory() {}
+
+public:
+    static LinearFwd<T>* get(mkldnn::memory::dims x, mkldnn::memory::dims w,
+                             mkldnn::memory::dims b, mkldnn::memory::dims y) {
+        LinearFwd<T>* linear_forward = NULL;
+        //try to find a suitable one in pool
+        linear_forward = dynamic_cast<LinearFwd<T>*>(
+            LinearFwdFactory<T>::get_instance().get_linear_fwd(x, w, b, y));
+        if (linear_forward == NULL) {
+            //LOG(INFO) << "create a new one for linear fwd";
+            linear_forward = new LinearFwd<T>(x, w, b, y);
+            LinearFwdFactory<T>::get_instance().set_linear_fwd(x, w, b, y, linear_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for linear fwd";
+        }
+        return linear_forward;
+    }
+    static LinearFwdFactory& get_instance() {
+        static LinearFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define LINEAR_FWD_PREFIX "linear_fwd_"
+    Op<T>* get_linear_fwd(mkldnn::memory::dims x,
+                          mkldnn::memory::dims w,
+                          mkldnn::memory::dims b,
+                          mkldnn::memory::dims y) {
+        std::string key = LINEAR_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(w);
+        key += dims_to_string(b);
+        key += dims_to_string(y);
+
+        return this->get_op(key);
+    }
+
+    void set_linear_fwd(mkldnn::memory::dims x,
+                        mkldnn::memory::dims w,
+                        mkldnn::memory::dims b,
+                        mkldnn::memory::dims y,
+                        Op<T>* op) {
+        std::string key = LINEAR_FWD_PREFIX;
+
+        key += dims_to_string(x);
+        key += dims_to_string(w);
+        key += dims_to_string(b);
+        key += dims_to_string(y);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _LINEAR_FWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/lrn_bwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/lrn_bwd_factory.h
new file mode 100644
index 00000000..d3110aa7
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/lrn_bwd_factory.h
@@ -0,0 +1,117 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _LRN_BWD_FACTORY_
+#define _LRN_BWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "lrn_bwd.h"
+
+template <typename T>
+class LocalResponseNormalizationBwdFactory : public OpFactory<T>
+{
+private:
+    LocalResponseNormalizationBwdFactory() {}
+    ~LocalResponseNormalizationBwdFactory() {}
+
+public:
+    static LocalResponseNormalizationBwd<T>* get(mkldnn::memory::dims src_d,
+                                                 mkldnn::memory::dims dst_d,
+                                                 mkldnn::memory::dims ws_d,
+                                                 mkldnn::memory::data_type ws_dt,
+                                                 int n, double k, double alpha, double beta,
+                                                 mkldnn::algorithm alg_kind) {
+
+        LocalResponseNormalizationBwd<T>* lrn_backward = NULL;
+
+        //try to find a suitable one in pool
+        lrn_backward = dynamic_cast<LocalResponseNormalizationBwd<T>*>(
+            LocalResponseNormalizationBwdFactory<T>::get_instance().get_lrn_bwd(
+                src_d, dst_d, ws_d, ws_dt, n, k, alpha, beta, alg_kind));
+
+        if (lrn_backward == NULL) {
+            //LOG(INFO) << "create a new one for lrn bwd: " << alg_kind;
+            lrn_backward = new LocalResponseNormalizationBwd<T>(
+                src_d, dst_d, ws_d, ws_dt, n, k, alpha, beta, alg_kind);
+            LocalResponseNormalizationBwdFactory<T>::get_instance().set_lrn_bwd(
+                src_d, dst_d, ws_d, ws_dt, n, k, alpha, beta, alg_kind, lrn_backward);
+        } else {
+            //LOG(INFO) << "reuse existing one for lrn bwd: " << alg_kind;
+        }
+        return lrn_backward;
+    }
+
+    static LocalResponseNormalizationBwdFactory& get_instance() {
+        static LocalResponseNormalizationBwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define LRN_BWD_PREFIX "lrn_bwd_"
+    Op<T>* get_lrn_bwd(mkldnn::memory::dims src_d,
+                       mkldnn::memory::dims dst_d,
+                       mkldnn::memory::dims ws_d,
+                       mkldnn::memory::data_type ws_dt,
+                       int n, double k, double alpha, double beta,
+                       mkldnn::algorithm alg_kind) {
+        std::string key = LRN_BWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += dims_to_string(ws_d);
+        key += int_to_string(ws_dt);
+        key += int_to_string(n);
+        key += double_to_string(k);
+        key += double_to_string(alpha);
+        key += double_to_string(beta);
+        key += int_to_string(alg_kind);
+
+        return this->get_op(key);
+    }
+
+    void set_lrn_bwd(mkldnn::memory::dims src_d,
+                     mkldnn::memory::dims dst_d,
+                     mkldnn::memory::dims ws_d,
+                     mkldnn::memory::data_type ws_dt,
+                     int n, double k, double alpha, double beta,
+                     mkldnn::algorithm alg_kind,
+                     Op<T> *op) {
+        std::string key = LRN_BWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += dims_to_string(ws_d);
+        key += int_to_string(ws_dt);
+        key += int_to_string(n);
+        key += double_to_string(k);
+        key += double_to_string(alpha);
+        key += double_to_string(beta);
+        key += int_to_string(alg_kind);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _LRN_BWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/lrn_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/lrn_fwd_factory.h
new file mode 100755
index 00000000..4cb36cb9
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/lrn_fwd_factory.h
@@ -0,0 +1,106 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _LRN_FWD_FACTORY_
+#define _LRN_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "lrn_fwd.h"
+
+template <typename T>
+class LocalResponseNormalizationFwdFactory : public OpFactory<T>
+{
+private:
+    LocalResponseNormalizationFwdFactory() {}
+    ~LocalResponseNormalizationFwdFactory() {}
+
+public:
+    static LocalResponseNormalizationFwd<T>* get(
+        mkldnn::memory::dims src_d, mkldnn::memory::format src_fmt,
+        int n, double k, double alpha, double beta,
+        mkldnn::algorithm alg_kind)
+    {
+        LocalResponseNormalizationFwd<T>* lrn_forward = NULL;
+
+        //try to find a suitable one in pool
+        lrn_forward = dynamic_cast<LocalResponseNormalizationFwd<T>*>(
+            LocalResponseNormalizationFwdFactory<T>::get_instance().get_lrn_fwd(
+                src_d, src_fmt, n, k, alpha, beta, alg_kind));
+
+        if (lrn_forward == NULL) {
+            //LOG(INFO) << "create a new one for lrn fwd: " << alg_kind;
+            lrn_forward = new LocalResponseNormalizationFwd<T>(
+                src_d, src_fmt, n, k, alpha, beta, alg_kind);
+            LocalResponseNormalizationFwdFactory<T>::get_instance().set_lrn_fwd(
+                src_d, src_fmt, n, k, alpha, beta, alg_kind, lrn_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for lrn fwd: " << alg_kind;
+        }
+        return lrn_forward;
+    }
+
+    static LocalResponseNormalizationFwdFactory& get_instance() {
+        static LocalResponseNormalizationFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define LRN_FWD_PREFIX "lrn_fwd_"
+    Op<T>* get_lrn_fwd(mkldnn::memory::dims src_d,
+                       mkldnn::memory::format src_fmt,
+                       int n, double k, double alpha, double beta,
+                       mkldnn::algorithm alg_kind) {
+        std::string key = LRN_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += int_to_string(src_fmt);
+        key += int_to_string(n);
+        key += double_to_string(k);
+        key += double_to_string(alpha);
+        key += double_to_string(beta);
+        key += int_to_string(alg_kind);
+
+        return this->get_op(key);
+    }
+
+    void set_lrn_fwd(mkldnn::memory::dims src_d,
+                     mkldnn::memory::format src_fmt,
+                     int n, double k, double alpha, double beta,
+                     mkldnn::algorithm alg_kind, Op<T> *op) {
+        std::string key = LRN_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += int_to_string(src_fmt);
+        key += int_to_string(n);
+        key += double_to_string(k);
+        key += double_to_string(alpha);
+        key += double_to_string(beta);
+        key += int_to_string(alg_kind);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _LRN_FWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/op_factory.h b/python/ideep4py/include/primitives/prim_mgr/op_factory.h
new file mode 100644
index 00000000..44c36d61
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/op_factory.h
@@ -0,0 +1,78 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _OP_FACTORY_
+#define _OP_FACTORY_
+
+#include <unordered_map>
+#include <string>
+#include <stdexcept>
+#include "op.h"
+#include "config.h"
+
+extern bool enable_prim_reuse;
+
+template <typename T>
+class OpFactory {
+public:
+    OpFactory() {};
+    ~OpFactory() {};
+    // virtual Op<T>* get() {return NULL;}
+
+    Op<T>* get_op(std::string key) {
+        // if primitive reuse is not enabled,
+        // just return NULL
+        if (!enable_prim_reuse)
+            return NULL;
+
+        auto stream_iter = map_.find(key);
+        if (stream_iter == map_.end()) {
+            return NULL;
+        } else {
+            return stream_iter->second;
+        }
+    };
+
+    void set_op(std::string key, Op<T>* op) {
+        // if primitive reuse is not enabled,
+        // just return
+        if (!enable_prim_reuse)
+            return;
+
+        auto stream_iter = map_.find(key);
+        if (stream_iter == map_.end()) {
+            map_[key] = op;
+        } else {
+            throw std::invalid_argument("cannot set same key to a new stream");
+        }
+    };
+
+public:
+    std::unordered_map<std::string, Op<T>*> map_;
+};
+
+#endif // _OP_FACTORY_
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
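OpFactory<T> is the base every factory above inherits from: a plain string-keyed map of cached Op<T> pointers, bypassed entirely when the global enable_prim_reuse flag is off. The contract, sketched with an elided key value:

    OpFactory<float> cache;
    Op<float> *op = cache.get_op(key);   // NULL on a miss (or when reuse is off)
    if (op == NULL)
        cache.set_op(key, fresh_op);     // a second set_op with the same
                                         // key throws std::invalid_argument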
diff --git a/python/ideep4py/include/primitives/prim_mgr/pooling_bwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/pooling_bwd_factory.h
new file mode 100644
index 00000000..a0b8a9de
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/pooling_bwd_factory.h
@@ -0,0 +1,130 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _POOLING_BWD_FACTORY_
+#define _POOLING_BWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "pooling_bwd.h"
+
+template <typename T>
+class Pooling2DBwdFactory : public OpFactory<T>
+{
+private:
+    Pooling2DBwdFactory() {}
+    ~Pooling2DBwdFactory() {}
+
+public:
+    static Pooling2DBwd<T>* get(mkldnn::memory::dims src_d,
+                                mkldnn::memory::dims dst_d,
+                                mkldnn::memory::dims ws_d,
+                                mkldnn::memory::data_type ws_dt,
+                                int ker_h, int ker_w,
+                                int sy, int sx,
+                                int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                                mkldnn::algorithm alg_kind) {
+        Pooling2DBwd<T>* pooling2d_backward = NULL;
+
+        //try to find a suitable one in pool
+        pooling2d_backward = dynamic_cast<Pooling2DBwd<T>*>(
+            Pooling2DBwdFactory<T>::get_instance().get_pooling2d_bwd(
+                src_d, dst_d, ws_d, ws_dt, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind));
+
+        if (pooling2d_backward == NULL) {
+            //LOG(INFO) << "create a new one for pooling bwd: " << alg_kind;
+            pooling2d_backward = new Pooling2DBwd<T>(
+                src_d, dst_d, ws_d, ws_dt, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind);
+            Pooling2DBwdFactory<T>::get_instance().set_pooling2d_bwd(
+                src_d, dst_d, ws_d, ws_dt, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind,
+                pooling2d_backward);
+        } else {
+            //LOG(INFO) << "reuse existing one for pooling bwd: " << alg_kind;
+        }
+        return pooling2d_backward;
+    }
+
+    static Pooling2DBwdFactory& get_instance() {
+        static Pooling2DBwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define POOLING2D_BWD_PREFIX "pooling2d_bwd_"
+    Op<T>* get_pooling2d_bwd(mkldnn::memory::dims src_d,
+                             mkldnn::memory::dims dst_d,
+                             mkldnn::memory::dims ws_d,
+                             mkldnn::memory::data_type ws_dt,
+                             int ker_h, int ker_w,
+                             int sy, int sx,
+                             int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                             mkldnn::algorithm alg_kind) {
+        std::string key = POOLING2D_BWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += dims_to_string(ws_d);
+        key += int_to_string(ws_dt);
+        key += int_to_string(ker_h);
+        key += int_to_string(ker_w);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+        key += int_to_string(alg_kind);
+
+        return this->get_op(key);
+    }
+
+    void set_pooling2d_bwd(mkldnn::memory::dims src_d,
+                           mkldnn::memory::dims dst_d,
+                           mkldnn::memory::dims ws_d,
+                           mkldnn::memory::data_type ws_dt,
+                           int ker_h, int ker_w,
+                           int sy, int sx,
+                           int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                           mkldnn::algorithm alg_kind,
+                           Op<T> *op) {
+        std::string key = POOLING2D_BWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += dims_to_string(ws_d);
+        key += int_to_string(ws_dt);
+        key += int_to_string(ker_h);
+        key += int_to_string(ker_w);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+        key += int_to_string(alg_kind);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _POOLING_BWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/pooling_fwd_factory.h b/python/ideep4py/include/primitives/prim_mgr/pooling_fwd_factory.h
new file mode 100644
index 00000000..e134dffd
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/pooling_fwd_factory.h
@@ -0,0 +1,120 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _POOLING_FWD_FACTORY_
+#define _POOLING_FWD_FACTORY_
+
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "pooling_fwd.h"
+
+template <typename T>
+class Pooling2DFwdFactory : public OpFactory<T>
+{
+private:
+    Pooling2DFwdFactory() {}
+    ~Pooling2DFwdFactory() {}
+
+public:
+    static Pooling2DFwd<T>* get(mkldnn::memory::dims src_d,
+                                mkldnn::memory::dims dst_d,
+                                int ker_h, int ker_w,
+                                int sy, int sx,
+                                int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                                mkldnn::algorithm alg_kind) {
+        Pooling2DFwd<T>* pooling2d_forward = NULL;
+
+        //try to find a suitable one in pool
+        pooling2d_forward = dynamic_cast<Pooling2DFwd<T>*>(
+            Pooling2DFwdFactory<T>::get_instance().get_pooling2d_fwd(
+                src_d, dst_d, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind));
+
+        if (pooling2d_forward == NULL) {
+            //LOG(INFO) << "create a new one for pooling fwd: " << alg_kind;
+            pooling2d_forward = new Pooling2DFwd<T>(
+                src_d, dst_d, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind);
+            Pooling2DFwdFactory<T>::get_instance().set_pooling2d_fwd(
+                src_d, dst_d, ker_h, ker_w, sy, sx, pad_lh, pad_lw, pad_rh, pad_rw, alg_kind,
+                pooling2d_forward);
+        } else {
+            //LOG(INFO) << "reuse existing one for pooling fwd: " << alg_kind;
+        }
+        return pooling2d_forward;
+    }
+
+    static Pooling2DFwdFactory& get_instance() {
+        static Pooling2DFwdFactory instance_;
+        return instance_;
+    }
+
+private:
+#define POOLING2D_FWD_PREFIX "pooling2d_fwd_"
+    Op<T>* get_pooling2d_fwd(mkldnn::memory::dims src_d,
+                             mkldnn::memory::dims dst_d,
+                             int ker_h, int ker_w,
+                             int sy, int sx,
+                             int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                             mkldnn::algorithm alg_kind) {
+        std::string key = POOLING2D_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += int_to_string(ker_h);
+        key += int_to_string(ker_w);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+        key += int_to_string(alg_kind);
+
+        return this->get_op(key);
+    }
+
+    void set_pooling2d_fwd(mkldnn::memory::dims src_d,
+                           mkldnn::memory::dims dst_d,
+                           int ker_h, int ker_w,
+                           int sy, int sx,
+                           int pad_lh, int pad_lw, int pad_rh, int pad_rw,
+                           mkldnn::algorithm alg_kind,
+                           Op<T> *op) {
+        std::string key = POOLING2D_FWD_PREFIX;
+
+        key += dims_to_string(src_d);
+        key += dims_to_string(dst_d);
+        key += int_to_string(ker_h);
+        key += int_to_string(ker_w);
+        key += int_to_string(sy);
+        key += int_to_string(sx);
+        key += int_to_string(pad_lh);
+        key += int_to_string(pad_lw);
+        key += int_to_string(pad_rh);
+        key += int_to_string(pad_rw);
+        key += int_to_string(alg_kind);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _POOLING_FWD_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/prim_factory.h b/python/ideep4py/include/primitives/prim_mgr/prim_factory.h
new file mode 100644
index 00000000..02a3c827
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/prim_factory.h
@@ -0,0 +1,43 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _PRIM_FACTORY_
+#define _PRIM_FACTORY_
+
+#include "reorder_factory.h"
+#include "conv_fwd_factory.h"
+#include "conv_bwd_data_factory.h"
+#include "conv_bwd_weights_factory.h"
+#include "pooling_fwd_factory.h"
+#include "pooling_bwd_factory.h"
+#include "eltwise_fwd_factory.h"
+#include "eltwise_bwd_factory.h"
+#include "bn_fwd_factory.h"
+#include "bn_bwd_factory.h"
+#include "concat_fwd_factory.h"
+#include "concat_bwd_factory.h"
+#include "lrn_fwd_factory.h"
+#include "lrn_bwd_factory.h"
+
+#endif // _PRIM_FACTORY_
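prim_factory.h is simply the umbrella header that pulls in every per-primitive factory above, and reorder_factory.h (next) caches layout-conversion primitives keyed on dims plus the source and destination memory formats. A hypothetical lookup converting plain nchw to the blocked nChw8c layout (format names from the MKL-DNN 0.x C++ API):

    mkldnn::memory::dims dims = {32, 64, 56, 56};
    auto *rop = ReorderFactory<float>::get(
        dims, mkldnn::memory::format::nchw, mkldnn::memory::format::nChw8c);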
diff --git a/python/ideep4py/include/primitives/prim_mgr/prim_factory.h b/python/ideep4py/include/primitives/prim_mgr/prim_factory.h
new file mode 100644
index 00000000..02a3c827
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/prim_factory.h
@@ -0,0 +1,43 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _PRIM_FACTORY_
+#define _PRIM_FACTORY_
+
+#include "reorder_factory.h"
+#include "conv_fwd_factory.h"
+#include "conv_bwd_data_factory.h"
+#include "conv_bwd_weights_factory.h"
+#include "pooling_fwd_factory.h"
+#include "pooling_bwd_factory.h"
+#include "eltwise_fwd_factory.h"
+#include "eltwise_bwd_factory.h"
+#include "bn_fwd_factory.h"
+#include "bn_bwd_factory.h"
+#include "concat_fwd_factory.h"
+#include "concat_bwd_factory.h"
+#include "lrn_fwd_factory.h"
+#include "lrn_bwd_factory.h"
+
+#endif // _PRIM_FACTORY_
diff --git a/python/ideep4py/include/primitives/prim_mgr/reorder_factory.h b/python/ideep4py/include/primitives/prim_mgr/reorder_factory.h
new file mode 100644
index 00000000..41293fbb
--- /dev/null
+++ b/python/ideep4py/include/primitives/prim_mgr/reorder_factory.h
@@ -0,0 +1,93 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#ifndef _REORDER_FACTORY_
+#define _REORDER_FACTORY_
+#include <mkldnn.hpp>
+#include <string>
+#include "op.h"
+#include "op_factory.h"
+#include <unordered_map>
+#include "utils.h"
+#include "reorder_op.h"
+
+template <typename T>
+class ReorderFactory : public OpFactory<T>
+{
+private:
+    ReorderFactory() {}
+    ~ReorderFactory() {}
+
+public:
+    static ReorderOp<T>* get(mkldnn::memory::dims dims, mkldnn::memory::format src_fmt, mkldnn::memory::format dst_fmt) {
+        ReorderOp<T>* reorder_op = NULL;
+
+        //try to find a suitable one in pool
+        reorder_op = dynamic_cast<ReorderOp<T>*> (
+            ReorderFactory<T>::get_instance().get_reorder(dims, src_fmt, dst_fmt));
+
+        if (reorder_op == NULL) {
+            //LOG(INFO) << "create a new one for reorder";
+            reorder_op = new ReorderOp<T>( dims, src_fmt, dst_fmt);
+            ReorderFactory<T>::get_instance().set_reorder( dims, src_fmt, dst_fmt, reorder_op);
+        } else {
+            //LOG(INFO) << "reuse exist one for reorder";
+        }
+        return reorder_op;
+    }
+
+    static ReorderFactory& get_instance() {
+        static ReorderFactory instance_;
+        return instance_;
+    }
+
+private:
+#define REORDER_PREFIX "reorder_"
+    Op<T>* get_reorder(mkldnn::memory::dims dims,
+                       mkldnn::memory::format src_fmt,
+                       mkldnn::memory::format dst_fmt) {
+        std::string key = REORDER_PREFIX;
+
+        key += dims_to_string(dims);
+        key += int_to_string((int)src_fmt);
+        key += int_to_string((int)dst_fmt);
+
+        return this->get_op(key);
+    }
+
+    void set_reorder(mkldnn::memory::dims dims,
+                     mkldnn::memory::format src_fmt,
+                     mkldnn::memory::format dst_fmt,
+                     Op<T> *op) {
+        std::string key = REORDER_PREFIX;
+
+        key += dims_to_string(dims);
+        key += int_to_string((int)src_fmt);
+        key += int_to_string((int)dst_fmt);
+
+        this->set_op(key, op);
+    }
+};
+
+#endif // _REORDER_FACTORY_
diff --git a/python/ideep4py/include/swigpyrun.h b/python/ideep4py/include/swigpyrun.h
new file mode 100644
index 00000000..f9bcd6c4
--- /dev/null
+++ b/python/ideep4py/include/swigpyrun.h
@@ -0,0 +1,2988 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 3.0.12
+ *
+ * This file is not intended to be easily readable and contains a number of
+ * coding conventions designed to improve portability and efficiency. Do not make
+ * changes to this file unless you know what you are doing--modify the SWIG
+ * interface file instead.
+ * ----------------------------------------------------------------------------- */
+#if !defined(SWIGPYTHON)
+#define SWIGPYTHON
+
+#define SWIGPYTHON_BUILTIN
+/* -----------------------------------------------------------------------------
+ * This section contains generic SWIG labels for method/variable
+ * declarations/attributes, and other compiler dependent labels.
+ * ----------------------------------------------------------------------------- */ + +/* template workaround for compilers that cannot correctly implement the C++ standard */ +#ifndef SWIGTEMPLATEDISAMBIGUATOR +# if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560) +# define SWIGTEMPLATEDISAMBIGUATOR template +# elif defined(__HP_aCC) +/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55 */ +/* If we find a maximum version that requires this, the test would be __HP_aCC <= 35500 for A.03.55 */ +# define SWIGTEMPLATEDISAMBIGUATOR template +# else +# define SWIGTEMPLATEDISAMBIGUATOR +# endif +#endif + +/* inline attribute */ +#ifndef SWIGINLINE +# if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__)) +# define SWIGINLINE inline +# else +# define SWIGINLINE +# endif +#endif + +/* attribute recognised by some compilers to avoid 'unused' warnings */ +#ifndef SWIGUNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define SWIGUNUSED __attribute__ ((__unused__)) +# else +# define SWIGUNUSED +# endif +# elif defined(__ICC) +# define SWIGUNUSED __attribute__ ((__unused__)) +# else +# define SWIGUNUSED +# endif +#endif + +#ifndef SWIG_MSC_UNSUPPRESS_4505 +# if defined(_MSC_VER) +# pragma warning(disable : 4505) /* unreferenced local function has been removed */ +# endif +#endif + +#ifndef SWIGUNUSEDPARM +# ifdef __cplusplus +# define SWIGUNUSEDPARM(p) +# else +# define SWIGUNUSEDPARM(p) p SWIGUNUSED +# endif +#endif + +/* internal SWIG method */ +#ifndef SWIGINTERN +# define SWIGINTERN static SWIGUNUSED +#endif + +/* internal inline SWIG method */ +#ifndef SWIGINTERNINLINE +# define SWIGINTERNINLINE SWIGINTERN SWIGINLINE +#endif + +/* exporting methods */ +#if defined(__GNUC__) +# if (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +# ifndef GCC_HASCLASSVISIBILITY +# define GCC_HASCLASSVISIBILITY +# endif +# endif +#endif + +#ifndef SWIGEXPORT +# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) +# if defined(STATIC_LINKED) +# define SWIGEXPORT +# else +# define SWIGEXPORT __declspec(dllexport) +# endif +# else +# if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY) +# define SWIGEXPORT __attribute__ ((visibility("default"))) +# else +# define SWIGEXPORT +# endif +# endif +#endif + +/* calling conventions for Windows */ +#ifndef SWIGSTDCALL +# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) +# define SWIGSTDCALL __stdcall +# else +# define SWIGSTDCALL +# endif +#endif + +/* Deal with Microsoft's attempt at deprecating C standard runtime functions */ +#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE) +# define _CRT_SECURE_NO_DEPRECATE +#endif + +/* Deal with Microsoft's attempt at deprecating methods in the standard C++ library */ +#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_SCL_SECURE_NO_DEPRECATE) +# define _SCL_SECURE_NO_DEPRECATE +#endif + +/* Deal with Apple's deprecated 'AssertMacros.h' from Carbon-framework */ +#if defined(__APPLE__) && !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES) +# define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 +#endif + +/* Intel's compiler complains if a variable which was never initialised is + * cast to void, which is a common idiom which we use to indicate that we + * are aware a variable isn't used. So we just silence that warning. 
+ * See: https://github.com/swig/swig/issues/192 for more discussion. + */ +#ifdef __INTEL_COMPILER +# pragma warning disable 592 +#endif +/* Errors in SWIG */ +#define SWIG_UnknownError -1 +#define SWIG_IOError -2 +#define SWIG_RuntimeError -3 +#define SWIG_IndexError -4 +#define SWIG_TypeError -5 +#define SWIG_DivisionByZero -6 +#define SWIG_OverflowError -7 +#define SWIG_SyntaxError -8 +#define SWIG_ValueError -9 +#define SWIG_SystemError -10 +#define SWIG_AttributeError -11 +#define SWIG_MemoryError -12 +#define SWIG_NullReferenceError -13 + + +/* ----------------------------------------------------------------------------- + * swigrun.swg + * + * This file contains generic C API SWIG runtime support for pointer + * type checking. + * ----------------------------------------------------------------------------- */ + +/* This should only be incremented when either the layout of swig_type_info changes, + or for whatever reason, the runtime changes incompatibly */ +#define SWIG_RUNTIME_VERSION "4" + +/* define SWIG_TYPE_TABLE_NAME as "SWIG_TYPE_TABLE" */ +#ifdef SWIG_TYPE_TABLE +# define SWIG_QUOTE_STRING(x) #x +# define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x) +# define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE) +#else +# define SWIG_TYPE_TABLE_NAME +#endif + +/* + You can use the SWIGRUNTIME and SWIGRUNTIMEINLINE macros for + creating a static or dynamic library from the SWIG runtime code. + In 99.9% of the cases, SWIG just needs to declare them as 'static'. + + But only do this if strictly necessary, ie, if you have problems + with your compiler or suchlike. +*/ + +#ifndef SWIGRUNTIME +# define SWIGRUNTIME SWIGINTERN +#endif + +#ifndef SWIGRUNTIMEINLINE +# define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE +#endif + +/* Generic buffer size */ +#ifndef SWIG_BUFFER_SIZE +# define SWIG_BUFFER_SIZE 1024 +#endif + +/* Flags for pointer conversions */ +#define SWIG_POINTER_DISOWN 0x1 +#define SWIG_CAST_NEW_MEMORY 0x2 + +/* Flags for new pointer objects */ +#define SWIG_POINTER_OWN 0x1 + + +/* + Flags/methods for returning states. + + The SWIG conversion methods, as ConvertPtr, return an integer + that tells if the conversion was successful or not. And if not, + an error code can be returned (see swigerrors.swg for the codes). + + Use the following macros/flags to set or process the returning + states. + + In old versions of SWIG, code such as the following was usually written: + + if (SWIG_ConvertPtr(obj,vptr,ty.flags) != -1) { + // success code + } else { + //fail code + } + + Now you can be more explicit: + + int res = SWIG_ConvertPtr(obj,vptr,ty.flags); + if (SWIG_IsOK(res)) { + // success code + } else { + // fail code + } + + which is the same really, but now you can also do + + Type *ptr; + int res = SWIG_ConvertPtr(obj,(void **)(&ptr),ty.flags); + if (SWIG_IsOK(res)) { + // success code + if (SWIG_IsNewObj(res) { + ... + delete *ptr; + } else { + ... + } + } else { + // fail code + } + + I.e., now SWIG_ConvertPtr can return new objects and you can + identify the case and take care of the deallocation. Of course that + also requires SWIG_ConvertPtr to return new result values, such as + + int SWIG_ConvertPtr(obj, ptr,...) { + if () { + if () { + *ptr = ; + return SWIG_NEWOBJ; + } else { + *ptr = ; + return SWIG_OLDOBJ; + } + } else { + return SWIG_BADOBJ; + } + } + + Of course, returning the plain '0(success)/-1(fail)' still works, but you can be + more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the + SWIG errors code. 
+ + Finally, if the SWIG_CASTRANK_MODE is enabled, the result code + allows to return the 'cast rank', for example, if you have this + + int food(double) + int fooi(int); + + and you call + + food(1) // cast rank '1' (1 -> 1.0) + fooi(1) // cast rank '0' + + just use the SWIG_AddCast()/SWIG_CheckState() +*/ + +#define SWIG_OK (0) +#define SWIG_ERROR (-1) +#define SWIG_IsOK(r) (r >= 0) +#define SWIG_ArgError(r) ((r != SWIG_ERROR) ? r : SWIG_TypeError) + +/* The CastRankLimit says how many bits are used for the cast rank */ +#define SWIG_CASTRANKLIMIT (1 << 8) +/* The NewMask denotes the object was created (using new/malloc) */ +#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1) +/* The TmpMask is for in/out typemaps that use temporal objects */ +#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1) +/* Simple returning values */ +#define SWIG_BADOBJ (SWIG_ERROR) +#define SWIG_OLDOBJ (SWIG_OK) +#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK) +#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK) +/* Check, add and del mask methods */ +#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r) +#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r) +#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK)) +#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r) +#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_TMPOBJMASK) : r) +#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK)) + +/* Cast-Rank Mode */ +#if defined(SWIG_CASTRANK_MODE) +# ifndef SWIG_TypeRank +# define SWIG_TypeRank unsigned long +# endif +# ifndef SWIG_MAXCASTRANK /* Default cast allowed */ +# define SWIG_MAXCASTRANK (2) +# endif +# define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT) -1) +# define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK) +SWIGINTERNINLINE int SWIG_AddCast(int r) { + return SWIG_IsOK(r) ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) : r; +} +SWIGINTERNINLINE int SWIG_CheckState(int r) { + return SWIG_IsOK(r) ? SWIG_CastRank(r) + 1 : 0; +} +#else /* no cast-rank mode */ +# define SWIG_AddCast(r) (r) +# define SWIG_CheckState(r) (SWIG_IsOK(r) ? 
1 : 0) +#endif + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void *(*swig_converter_func)(void *, int *); +typedef struct swig_type_info *(*swig_dycast_func)(void **); + +/* Structure to store information on one type */ +typedef struct swig_type_info { + const char *name; /* mangled name of this type */ + const char *str; /* human readable name of this type */ + swig_dycast_func dcast; /* dynamic cast function down a hierarchy */ + struct swig_cast_info *cast; /* linked list of types that can cast into this type */ + void *clientdata; /* language specific type data */ + int owndata; /* flag if the structure owns the clientdata */ +} swig_type_info; + +/* Structure to store a type and conversion function used for casting */ +typedef struct swig_cast_info { + swig_type_info *type; /* pointer to type that is equivalent to this type */ + swig_converter_func converter; /* function to cast the void pointers */ + struct swig_cast_info *next; /* pointer to next cast in linked list */ + struct swig_cast_info *prev; /* pointer to the previous cast */ +} swig_cast_info; + +/* Structure used to store module information + * Each module generates one structure like this, and the runtime collects + * all of these structures and stores them in a circularly linked list.*/ +typedef struct swig_module_info { + swig_type_info **types; /* Array of pointers to swig_type_info structures that are in this module */ + size_t size; /* Number of types in this module */ + struct swig_module_info *next; /* Pointer to next element in circularly linked list */ + swig_type_info **type_initial; /* Array of initially generated type structures */ + swig_cast_info **cast_initial; /* Array of initially generated casting structures */ + void *clientdata; /* Language specific module data */ +} swig_module_info; + +/* + Compare two type names skipping the space characters, therefore + "char*" == "char *" and "Class" == "Class", etc. + + Return 0 when the two name types are equivalent, as in + strncmp, but skipping ' '. +*/ +SWIGRUNTIME int +SWIG_TypeNameComp(const char *f1, const char *l1, + const char *f2, const char *l2) { + for (;(f1 != l1) && (f2 != l2); ++f1, ++f2) { + while ((*f1 == ' ') && (f1 != l1)) ++f1; + while ((*f2 == ' ') && (f2 != l2)) ++f2; + if (*f1 != *f2) return (*f1 > *f2) ? 1 : -1; + } + return (int)((l1 - f1) - (l2 - f2)); +} + +/* + Check type equivalence in a name list like ||... + Return 0 if equal, -1 if nb < tb, 1 if nb > tb +*/ +SWIGRUNTIME int +SWIG_TypeCmp(const char *nb, const char *tb) { + int equiv = 1; + const char* te = tb + strlen(tb); + const char* ne = nb; + while (equiv != 0 && *ne) { + for (nb = ne; *ne; ++ne) { + if (*ne == '|') break; + } + equiv = SWIG_TypeNameComp(nb, ne, tb, te); + if (*ne) ++ne; + } + return equiv; +} + +/* + Check type equivalence in a name list like ||... + Return 0 if not equal, 1 if equal +*/ +SWIGRUNTIME int +SWIG_TypeEquiv(const char *nb, const char *tb) { + return SWIG_TypeCmp(nb, tb) == 0 ? 
1 : 0; +} + +/* + Check the typename +*/ +SWIGRUNTIME swig_cast_info * +SWIG_TypeCheck(const char *c, swig_type_info *ty) { + if (ty) { + swig_cast_info *iter = ty->cast; + while (iter) { + if (strcmp(iter->type->name, c) == 0) { + if (iter == ty->cast) + return iter; + /* Move iter to the top of the linked list */ + iter->prev->next = iter->next; + if (iter->next) + iter->next->prev = iter->prev; + iter->next = ty->cast; + iter->prev = 0; + if (ty->cast) ty->cast->prev = iter; + ty->cast = iter; + return iter; + } + iter = iter->next; + } + } + return 0; +} + +/* + Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer comparison +*/ +SWIGRUNTIME swig_cast_info * +SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) { + if (ty) { + swig_cast_info *iter = ty->cast; + while (iter) { + if (iter->type == from) { + if (iter == ty->cast) + return iter; + /* Move iter to the top of the linked list */ + iter->prev->next = iter->next; + if (iter->next) + iter->next->prev = iter->prev; + iter->next = ty->cast; + iter->prev = 0; + if (ty->cast) ty->cast->prev = iter; + ty->cast = iter; + return iter; + } + iter = iter->next; + } + } + return 0; +} + +/* + Cast a pointer up an inheritance hierarchy +*/ +SWIGRUNTIMEINLINE void * +SWIG_TypeCast(swig_cast_info *ty, void *ptr, int *newmemory) { + return ((!ty) || (!ty->converter)) ? ptr : (*ty->converter)(ptr, newmemory); +} + +/* + Dynamic pointer casting. Down an inheritance hierarchy +*/ +SWIGRUNTIME swig_type_info * +SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) { + swig_type_info *lastty = ty; + if (!ty || !ty->dcast) return ty; + while (ty && (ty->dcast)) { + ty = (*ty->dcast)(ptr); + if (ty) lastty = ty; + } + return lastty; +} + +/* + Return the name associated with this type +*/ +SWIGRUNTIMEINLINE const char * +SWIG_TypeName(const swig_type_info *ty) { + return ty->name; +} + +/* + Return the pretty name associated with this type, + that is an unmangled type name in a form presentable to the user. +*/ +SWIGRUNTIME const char * +SWIG_TypePrettyName(const swig_type_info *type) { + /* The "str" field contains the equivalent pretty names of the + type, separated by vertical-bar characters. We choose + to print the last name, as it is often (?) the most + specific. */ + if (!type) return NULL; + if (type->str != NULL) { + const char *last_name = type->str; + const char *s; + for (s = type->str; *s; s++) + if (*s == '|') last_name = s+1; + return last_name; + } + else + return type->name; +} + +/* + Set the clientdata field for a type +*/ +SWIGRUNTIME void +SWIG_TypeClientData(swig_type_info *ti, void *clientdata) { + swig_cast_info *cast = ti->cast; + /* if (ti->clientdata == clientdata) return; */ + ti->clientdata = clientdata; + + while (cast) { + if (!cast->converter) { + swig_type_info *tc = cast->type; + if (!tc->clientdata) { + SWIG_TypeClientData(tc, clientdata); + } + } + cast = cast->next; + } +} +SWIGRUNTIME void +SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) { + SWIG_TypeClientData(ti, clientdata); + ti->owndata = 1; +} + +/* + Search for a swig_type_info structure only by mangled name + Search is a O(log #types) + + We start searching at module start, and finish searching when start == end. + Note: if start == end at the beginning of the function, we go all the way around + the circular list. 
+*/ +SWIGRUNTIME swig_type_info * +SWIG_MangledTypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { + swig_module_info *iter = start; + do { + if (iter->size) { + size_t l = 0; + size_t r = iter->size - 1; + do { + /* since l+r >= 0, we can (>> 1) instead (/ 2) */ + size_t i = (l + r) >> 1; + const char *iname = iter->types[i]->name; + if (iname) { + int compare = strcmp(name, iname); + if (compare == 0) { + return iter->types[i]; + } else if (compare < 0) { + if (i) { + r = i - 1; + } else { + break; + } + } else if (compare > 0) { + l = i + 1; + } + } else { + break; /* should never happen */ + } + } while (l <= r); + } + iter = iter->next; + } while (iter != end); + return 0; +} + +/* + Search for a swig_type_info structure for either a mangled name or a human readable name. + It first searches the mangled names of the types, which is a O(log #types) + If a type is not found it then searches the human readable names, which is O(#types). + + We start searching at module start, and finish searching when start == end. + Note: if start == end at the beginning of the function, we go all the way around + the circular list. +*/ +SWIGRUNTIME swig_type_info * +SWIG_TypeQueryModule(swig_module_info *start, + swig_module_info *end, + const char *name) { + /* STEP 1: Search the name field using binary search */ + swig_type_info *ret = SWIG_MangledTypeQueryModule(start, end, name); + if (ret) { + return ret; + } else { + /* STEP 2: If the type hasn't been found, do a complete search + of the str field (the human readable name) */ + swig_module_info *iter = start; + do { + size_t i = 0; + for (; i < iter->size; ++i) { + if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name))) + return iter->types[i]; + } + iter = iter->next; + } while (iter != end); + } + + /* neither found a match */ + return 0; +} + +/* + Pack binary data into a string +*/ +SWIGRUNTIME char * +SWIG_PackData(char *c, void *ptr, size_t sz) { + static const char hex[17] = "0123456789abcdef"; + const unsigned char *u = (unsigned char *) ptr; + const unsigned char *eu = u + sz; + for (; u != eu; ++u) { + unsigned char uu = *u; + *(c++) = hex[(uu & 0xf0) >> 4]; + *(c++) = hex[uu & 0xf]; + } + return c; +} + +/* + Unpack binary data from a string +*/ +SWIGRUNTIME const char * +SWIG_UnpackData(const char *c, void *ptr, size_t sz) { + unsigned char *u = (unsigned char *) ptr; + const unsigned char *eu = u + sz; + for (; u != eu; ++u) { + char d = *(c++); + unsigned char uu; + if ((d >= '0') && (d <= '9')) + uu = (unsigned char)((d - '0') << 4); + else if ((d >= 'a') && (d <= 'f')) + uu = (unsigned char)((d - ('a'-10)) << 4); + else + return (char *) 0; + d = *(c++); + if ((d >= '0') && (d <= '9')) + uu |= (unsigned char)(d - '0'); + else if ((d >= 'a') && (d <= 'f')) + uu |= (unsigned char)(d - ('a'-10)); + else + return (char *) 0; + *u = uu; + } + return c; +} + +/* + Pack 'void *' into a string buffer. 
+*/ +SWIGRUNTIME char * +SWIG_PackVoidPtr(char *buff, void *ptr, const char *name, size_t bsz) { + char *r = buff; + if ((2*sizeof(void *) + 2) > bsz) return 0; + *(r++) = '_'; + r = SWIG_PackData(r,&ptr,sizeof(void *)); + if (strlen(name) + 1 > (bsz - (r - buff))) return 0; + strcpy(r,name); + return buff; +} + +SWIGRUNTIME const char * +SWIG_UnpackVoidPtr(const char *c, void **ptr, const char *name) { + if (*c != '_') { + if (strcmp(c,"NULL") == 0) { + *ptr = (void *) 0; + return name; + } else { + return 0; + } + } + return SWIG_UnpackData(++c,ptr,sizeof(void *)); +} + +SWIGRUNTIME char * +SWIG_PackDataName(char *buff, void *ptr, size_t sz, const char *name, size_t bsz) { + char *r = buff; + size_t lname = (name ? strlen(name) : 0); + if ((2*sz + 2 + lname) > bsz) return 0; + *(r++) = '_'; + r = SWIG_PackData(r,ptr,sz); + if (lname) { + strncpy(r,name,lname+1); + } else { + *r = 0; + } + return buff; +} + +SWIGRUNTIME const char * +SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) { + if (*c != '_') { + if (strcmp(c,"NULL") == 0) { + memset(ptr,0,sz); + return name; + } else { + return 0; + } + } + return SWIG_UnpackData(++c,ptr,sz); +} + +#ifdef __cplusplus +} +#endif +/* Compatibility macros for Python 3 */ +#if PY_VERSION_HEX >= 0x03000000 + +#define PyClass_Check(obj) PyObject_IsInstance(obj, (PyObject *)&PyType_Type) +#define PyInt_Check(x) PyLong_Check(x) +#define PyInt_AsLong(x) PyLong_AsLong(x) +#define PyInt_FromLong(x) PyLong_FromLong(x) +#define PyInt_FromSize_t(x) PyLong_FromSize_t(x) +#define PyString_Check(name) PyBytes_Check(name) +#define PyString_FromString(x) PyUnicode_FromString(x) +#define PyString_Format(fmt, args) PyUnicode_Format(fmt, args) +#define PyString_AsString(str) PyBytes_AsString(str) +#define PyString_Size(str) PyBytes_Size(str) +#define PyString_InternFromString(key) PyUnicode_InternFromString(key) +#define Py_TPFLAGS_HAVE_CLASS Py_TPFLAGS_BASETYPE +#define PyString_AS_STRING(x) PyUnicode_AS_STRING(x) +#define _PyLong_FromSsize_t(x) PyLong_FromSsize_t(x) + +#endif + +#ifndef Py_TYPE +# define Py_TYPE(op) ((op)->ob_type) +#endif + +/* SWIG APIs for compatibility of both Python 2 & 3 */ + +#if PY_VERSION_HEX >= 0x03000000 +# define SWIG_Python_str_FromFormat PyUnicode_FromFormat +#else +# define SWIG_Python_str_FromFormat PyString_FromFormat +#endif + + +/* Warning: This function will allocate a new string in Python 3, + * so please call SWIG_Python_str_DelForPy3(x) to free the space. 
+ */ +SWIGINTERN char* +SWIG_Python_str_AsChar(PyObject *str) +{ +#if PY_VERSION_HEX >= 0x03000000 + char *cstr; + char *newstr; + Py_ssize_t len; + str = PyUnicode_AsUTF8String(str); + PyBytes_AsStringAndSize(str, &cstr, &len); + newstr = (char *) malloc(len+1); + memcpy(newstr, cstr, len+1); + Py_XDECREF(str); + return newstr; +#else + return PyString_AsString(str); +#endif +} + +#if PY_VERSION_HEX >= 0x03000000 +# define SWIG_Python_str_DelForPy3(x) free( (void*) (x) ) +#else +# define SWIG_Python_str_DelForPy3(x) +#endif + + +SWIGINTERN PyObject* +SWIG_Python_str_FromChar(const char *c) +{ +#if PY_VERSION_HEX >= 0x03000000 + return PyUnicode_FromString(c); +#else + return PyString_FromString(c); +#endif +} + +/* Add PyOS_snprintf for old Pythons */ +#if PY_VERSION_HEX < 0x02020000 +# if defined(_MSC_VER) || defined(__BORLANDC__) || defined(_WATCOM) +# define PyOS_snprintf _snprintf +# else +# define PyOS_snprintf snprintf +# endif +#endif + +/* A crude PyString_FromFormat implementation for old Pythons */ +#if PY_VERSION_HEX < 0x02020000 + +#ifndef SWIG_PYBUFFER_SIZE +# define SWIG_PYBUFFER_SIZE 1024 +#endif + +static PyObject * +PyString_FromFormat(const char *fmt, ...) { + va_list ap; + char buf[SWIG_PYBUFFER_SIZE * 2]; + int res; + va_start(ap, fmt); + res = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + return (res < 0 || res >= (int)sizeof(buf)) ? 0 : PyString_FromString(buf); +} +#endif + +#ifndef PyObject_DEL +# define PyObject_DEL PyObject_Del +#endif + +/* A crude PyExc_StopIteration exception for old Pythons */ +#if PY_VERSION_HEX < 0x02020000 +# ifndef PyExc_StopIteration +# define PyExc_StopIteration PyExc_RuntimeError +# endif +# ifndef PyObject_GenericGetAttr +# define PyObject_GenericGetAttr 0 +# endif +#endif + +/* Py_NotImplemented is defined in 2.1 and up. */ +#if PY_VERSION_HEX < 0x02010000 +# ifndef Py_NotImplemented +# define Py_NotImplemented PyExc_RuntimeError +# endif +#endif + +/* A crude PyString_AsStringAndSize implementation for old Pythons */ +#if PY_VERSION_HEX < 0x02010000 +# ifndef PyString_AsStringAndSize +# define PyString_AsStringAndSize(obj, s, len) {*s = PyString_AsString(obj); *len = *s ? strlen(*s) : 0;} +# endif +#endif + +/* PySequence_Size for old Pythons */ +#if PY_VERSION_HEX < 0x02000000 +# ifndef PySequence_Size +# define PySequence_Size PySequence_Length +# endif +#endif + +/* PyBool_FromLong for old Pythons */ +#if PY_VERSION_HEX < 0x02030000 +static +PyObject *PyBool_FromLong(long ok) +{ + PyObject *result = ok ? 
Py_True : Py_False; + Py_INCREF(result); + return result; +} +#endif + +/* Py_ssize_t for old Pythons */ +/* This code is as recommended by: */ +/* http://www.python.org/dev/peps/pep-0353/#conversion-guidelines */ +#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) +typedef int Py_ssize_t; +# define PY_SSIZE_T_MAX INT_MAX +# define PY_SSIZE_T_MIN INT_MIN +typedef inquiry lenfunc; +typedef intargfunc ssizeargfunc; +typedef intintargfunc ssizessizeargfunc; +typedef intobjargproc ssizeobjargproc; +typedef intintobjargproc ssizessizeobjargproc; +typedef getreadbufferproc readbufferproc; +typedef getwritebufferproc writebufferproc; +typedef getsegcountproc segcountproc; +typedef getcharbufferproc charbufferproc; +static long PyNumber_AsSsize_t (PyObject *x, void *SWIGUNUSEDPARM(exc)) +{ + long result = 0; + PyObject *i = PyNumber_Int(x); + if (i) { + result = PyInt_AsLong(i); + Py_DECREF(i); + } + return result; +} +#endif + +#if PY_VERSION_HEX < 0x02050000 +#define PyInt_FromSize_t(x) PyInt_FromLong((long)x) +#endif + +#if PY_VERSION_HEX < 0x02040000 +#define Py_VISIT(op) \ + do { \ + if (op) { \ + int vret = visit((op), arg); \ + if (vret) \ + return vret; \ + } \ + } while (0) +#endif + +#if PY_VERSION_HEX < 0x02030000 +typedef struct { + PyTypeObject type; + PyNumberMethods as_number; + PyMappingMethods as_mapping; + PySequenceMethods as_sequence; + PyBufferProcs as_buffer; + PyObject *name, *slots; +} PyHeapTypeObject; +#endif + +#if PY_VERSION_HEX < 0x02030000 +typedef destructor freefunc; +#endif + +#if ((PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 6) || \ + (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 0) || \ + (PY_MAJOR_VERSION > 3)) +# define SWIGPY_USE_CAPSULE +# define SWIGPY_CAPSULE_NAME ((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION ".type_pointer_capsule" SWIG_TYPE_TABLE_NAME) +#endif + +#if PY_VERSION_HEX < 0x03020000 +#define PyDescr_TYPE(x) (((PyDescrObject *)(x))->d_type) +#define PyDescr_NAME(x) (((PyDescrObject *)(x))->d_name) +#define Py_hash_t long +#endif +/* ----------------------------------------------------------------------------- + * error manipulation + * ----------------------------------------------------------------------------- */ + +SWIGRUNTIME PyObject* +SWIG_Python_ErrorType(int code) { + PyObject* type = 0; + switch(code) { + case SWIG_MemoryError: + type = PyExc_MemoryError; + break; + case SWIG_IOError: + type = PyExc_IOError; + break; + case SWIG_RuntimeError: + type = PyExc_RuntimeError; + break; + case SWIG_IndexError: + type = PyExc_IndexError; + break; + case SWIG_TypeError: + type = PyExc_TypeError; + break; + case SWIG_DivisionByZero: + type = PyExc_ZeroDivisionError; + break; + case SWIG_OverflowError: + type = PyExc_OverflowError; + break; + case SWIG_SyntaxError: + type = PyExc_SyntaxError; + break; + case SWIG_ValueError: + type = PyExc_ValueError; + break; + case SWIG_SystemError: + type = PyExc_SystemError; + break; + case SWIG_AttributeError: + type = PyExc_AttributeError; + break; + default: + type = PyExc_RuntimeError; + } + return type; +} + + +SWIGRUNTIME void +SWIG_Python_AddErrorMsg(const char* mesg) +{ + PyObject *type = 0; + PyObject *value = 0; + PyObject *traceback = 0; + + if (PyErr_Occurred()) PyErr_Fetch(&type, &value, &traceback); + if (value) { + char *tmp; + PyObject *old_str = PyObject_Str(value); + PyErr_Clear(); + Py_XINCREF(type); + + PyErr_Format(type, "%s %s", tmp = SWIG_Python_str_AsChar(old_str), mesg); + SWIG_Python_str_DelForPy3(tmp); + Py_DECREF(old_str); + Py_DECREF(value); + } else { + 
PyErr_SetString(PyExc_RuntimeError, mesg); + } +} +#if defined(SWIG_PYTHON_NO_THREADS) +# if defined(SWIG_PYTHON_THREADS) +# undef SWIG_PYTHON_THREADS +# endif +#endif +#if defined(SWIG_PYTHON_THREADS) /* Threading support is enabled */ +# if !defined(SWIG_PYTHON_USE_GIL) && !defined(SWIG_PYTHON_NO_USE_GIL) +# if (PY_VERSION_HEX >= 0x02030000) /* For 2.3 or later, use the PyGILState calls */ +# define SWIG_PYTHON_USE_GIL +# endif +# endif +# if defined(SWIG_PYTHON_USE_GIL) /* Use PyGILState threads calls */ +# ifndef SWIG_PYTHON_INITIALIZE_THREADS +# define SWIG_PYTHON_INITIALIZE_THREADS PyEval_InitThreads() +# endif +# ifdef __cplusplus /* C++ code */ + class SWIG_Python_Thread_Block { + bool status; + PyGILState_STATE state; + public: + void end() { if (status) { PyGILState_Release(state); status = false;} } + SWIG_Python_Thread_Block() : status(true), state(PyGILState_Ensure()) {} + ~SWIG_Python_Thread_Block() { end(); } + }; + class SWIG_Python_Thread_Allow { + bool status; + PyThreadState *save; + public: + void end() { if (status) { PyEval_RestoreThread(save); status = false; }} + SWIG_Python_Thread_Allow() : status(true), save(PyEval_SaveThread()) {} + ~SWIG_Python_Thread_Allow() { end(); } + }; +# define SWIG_PYTHON_THREAD_BEGIN_BLOCK SWIG_Python_Thread_Block _swig_thread_block +# define SWIG_PYTHON_THREAD_END_BLOCK _swig_thread_block.end() +# define SWIG_PYTHON_THREAD_BEGIN_ALLOW SWIG_Python_Thread_Allow _swig_thread_allow +# define SWIG_PYTHON_THREAD_END_ALLOW _swig_thread_allow.end() +# else /* C code */ +# define SWIG_PYTHON_THREAD_BEGIN_BLOCK PyGILState_STATE _swig_thread_block = PyGILState_Ensure() +# define SWIG_PYTHON_THREAD_END_BLOCK PyGILState_Release(_swig_thread_block) +# define SWIG_PYTHON_THREAD_BEGIN_ALLOW PyThreadState *_swig_thread_allow = PyEval_SaveThread() +# define SWIG_PYTHON_THREAD_END_ALLOW PyEval_RestoreThread(_swig_thread_allow) +# endif +# else /* Old thread way, not implemented, user must provide it */ +# if !defined(SWIG_PYTHON_INITIALIZE_THREADS) +# define SWIG_PYTHON_INITIALIZE_THREADS +# endif +# if !defined(SWIG_PYTHON_THREAD_BEGIN_BLOCK) +# define SWIG_PYTHON_THREAD_BEGIN_BLOCK +# endif +# if !defined(SWIG_PYTHON_THREAD_END_BLOCK) +# define SWIG_PYTHON_THREAD_END_BLOCK +# endif +# if !defined(SWIG_PYTHON_THREAD_BEGIN_ALLOW) +# define SWIG_PYTHON_THREAD_BEGIN_ALLOW +# endif +# if !defined(SWIG_PYTHON_THREAD_END_ALLOW) +# define SWIG_PYTHON_THREAD_END_ALLOW +# endif +# endif +#else /* No thread support */ +# define SWIG_PYTHON_INITIALIZE_THREADS +# define SWIG_PYTHON_THREAD_BEGIN_BLOCK +# define SWIG_PYTHON_THREAD_END_BLOCK +# define SWIG_PYTHON_THREAD_BEGIN_ALLOW +# define SWIG_PYTHON_THREAD_END_ALLOW +#endif +/* ----------------------------------------------------------------------------- + * Python API portion that goes into the runtime + * ----------------------------------------------------------------------------- */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* ----------------------------------------------------------------------------- + * Constant declarations + * ----------------------------------------------------------------------------- */ + +/* Constant Types */ +#define SWIG_PY_POINTER 4 +#define SWIG_PY_BINARY 5 + +/* Constant information structure */ +typedef struct swig_const_info { + int type; + char *name; + long lvalue; + double dvalue; + void *pvalue; + swig_type_info **ptype; +} swig_const_info; + + +/* ----------------------------------------------------------------------------- + * Wrapper of PyInstanceMethod_New() used 
in Python 3 + * It is exported to the generated module, used for -fastproxy + * ----------------------------------------------------------------------------- */ +#if PY_VERSION_HEX >= 0x03000000 +SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *func) +{ + return PyInstanceMethod_New(func); +} +#else +SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *SWIGUNUSEDPARM(func)) +{ + return NULL; +} +#endif + +#ifdef __cplusplus +} +#endif + +/* ----------------------------------------------------------------------------- + * pyrun.swg + * + * This file contains the runtime support for Python modules + * and includes code for managing global variables and pointer + * type checking. + * + * ----------------------------------------------------------------------------- */ + +/* Common SWIG API */ + +/* for raw pointers */ +#define SWIG_Python_ConvertPtr(obj, pptr, type, flags) SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, 0) +#define SWIG_ConvertPtr(obj, pptr, type, flags) SWIG_Python_ConvertPtr(obj, pptr, type, flags) +#define SWIG_ConvertPtrAndOwn(obj,pptr,type,flags,own) SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, own) + +#ifdef SWIGPYTHON_BUILTIN +#define SWIG_NewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(self, ptr, type, flags) +#else +#define SWIG_NewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(NULL, ptr, type, flags) +#endif + +#define SWIG_InternalNewPointerObj(ptr, type, flags) SWIG_Python_NewPointerObj(NULL, ptr, type, flags) + +#define SWIG_CheckImplicit(ty) SWIG_Python_CheckImplicit(ty) +#define SWIG_AcquirePtr(ptr, src) SWIG_Python_AcquirePtr(ptr, src) +#define swig_owntype int + +/* for raw packed data */ +#define SWIG_ConvertPacked(obj, ptr, sz, ty) SWIG_Python_ConvertPacked(obj, ptr, sz, ty) +#define SWIG_NewPackedObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) + +/* for class or struct pointers */ +#define SWIG_ConvertInstance(obj, pptr, type, flags) SWIG_ConvertPtr(obj, pptr, type, flags) +#define SWIG_NewInstanceObj(ptr, type, flags) SWIG_NewPointerObj(ptr, type, flags) + +/* for C or C++ function pointers */ +#define SWIG_ConvertFunctionPtr(obj, pptr, type) SWIG_Python_ConvertFunctionPtr(obj, pptr, type) +#define SWIG_NewFunctionPtrObj(ptr, type) SWIG_Python_NewPointerObj(NULL, ptr, type, 0) + +/* for C++ member pointers, ie, member methods */ +#define SWIG_ConvertMember(obj, ptr, sz, ty) SWIG_Python_ConvertPacked(obj, ptr, sz, ty) +#define SWIG_NewMemberObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type) + + +/* Runtime API */ + +#define SWIG_GetModule(clientdata) SWIG_Python_GetModule(clientdata) +#define SWIG_SetModule(clientdata, pointer) SWIG_Python_SetModule(pointer) +#define SWIG_NewClientData(obj) SwigPyClientData_New(obj) + +#define SWIG_SetErrorObj SWIG_Python_SetErrorObj +#define SWIG_SetErrorMsg SWIG_Python_SetErrorMsg +#define SWIG_ErrorType(code) SWIG_Python_ErrorType(code) +#define SWIG_Error(code, msg) SWIG_Python_SetErrorMsg(SWIG_ErrorType(code), msg) +#define SWIG_fail goto fail + + +/* Runtime API implementation */ + +/* Error manipulation */ + +SWIGINTERN void +SWIG_Python_SetErrorObj(PyObject *errtype, PyObject *obj) { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + PyErr_SetObject(errtype, obj); + Py_DECREF(obj); + SWIG_PYTHON_THREAD_END_BLOCK; +} + +SWIGINTERN void +SWIG_Python_SetErrorMsg(PyObject *errtype, const char *msg) { + SWIG_PYTHON_THREAD_BEGIN_BLOCK; + PyErr_SetString(errtype, msg); + SWIG_PYTHON_THREAD_END_BLOCK; +} + 
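The macros just defined (`SWIG_Error`, `SWIG_fail`, `SWIG_ErrorType`) are the runtime's idiom for raising Python exceptions from wrapper code: `SWIG_ErrorType` maps a SWIG error code to the matching `PyExc_*` object, and `SWIG_fail` jumps to the wrapper's cleanup label. A minimal hand-written wrapper in that style is sketched below; the function, its name, and the message are hypothetical, and it assumes a translation unit that includes this runtime:

```cpp
#include <cmath>

// Hypothetical wrapper: SWIG_Error selects PyExc_ValueError via
// SWIG_ErrorType(SWIG_ValueError), and SWIG_fail expands to `goto fail`,
// exactly as SWIG-generated wrappers use these macros.
static PyObject *wrap_checked_sqrt(PyObject *SWIGUNUSEDPARM(self), PyObject *args) {
    double x = 0.0;
    if (!PyArg_ParseTuple(args, "d", &x)) SWIG_fail;  // TypeError already set
    if (x < 0.0) {
        SWIG_Error(SWIG_ValueError, "checked_sqrt: argument must be >= 0");
        SWIG_fail;
    }
    return PyFloat_FromDouble(std::sqrt(x));
fail:
    return NULL;  // a Python exception is already set on this path
}
```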
+#define SWIG_Python_Raise(obj, type, desc) SWIG_Python_SetErrorObj(SWIG_Python_ExceptionType(desc), obj) + +/* Set a constant value */ + +#if defined(SWIGPYTHON_BUILTIN) + +SWIGINTERN void +SwigPyBuiltin_AddPublicSymbol(PyObject *seq, const char *key) { + PyObject *s = PyString_InternFromString(key); + PyList_Append(seq, s); + Py_DECREF(s); +} + +SWIGINTERN void +SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, const char *name, PyObject *obj) { +#if PY_VERSION_HEX < 0x02030000 + PyDict_SetItemString(d, (char *)name, obj); +#else + PyDict_SetItemString(d, name, obj); +#endif + Py_DECREF(obj); + if (public_interface) + SwigPyBuiltin_AddPublicSymbol(public_interface, name); +} + +#else + +SWIGINTERN void +SWIG_Python_SetConstant(PyObject *d, const char *name, PyObject *obj) { +#if PY_VERSION_HEX < 0x02030000 + PyDict_SetItemString(d, (char *)name, obj); +#else + PyDict_SetItemString(d, name, obj); +#endif + Py_DECREF(obj); +} + +#endif + +/* Append a value to the result obj */ + +SWIGINTERN PyObject* +SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) { +#if !defined(SWIG_PYTHON_OUTPUT_TUPLE) + if (!result) { + result = obj; + } else if (result == Py_None) { + Py_DECREF(result); + result = obj; + } else { + if (!PyList_Check(result)) { + PyObject *o2 = result; + result = PyList_New(1); + PyList_SetItem(result, 0, o2); + } + PyList_Append(result,obj); + Py_DECREF(obj); + } + return result; +#else + PyObject* o2; + PyObject* o3; + if (!result) { + result = obj; + } else if (result == Py_None) { + Py_DECREF(result); + result = obj; + } else { + if (!PyTuple_Check(result)) { + o2 = result; + result = PyTuple_New(1); + PyTuple_SET_ITEM(result, 0, o2); + } + o3 = PyTuple_New(1); + PyTuple_SET_ITEM(o3, 0, obj); + o2 = result; + result = PySequence_Concat(o2, o3); + Py_DECREF(o2); + Py_DECREF(o3); + } + return result; +#endif +} + +/* Unpack the argument tuple */ + +SWIGINTERN Py_ssize_t +SWIG_Python_UnpackTuple(PyObject *args, const char *name, Py_ssize_t min, Py_ssize_t max, PyObject **objs) +{ + if (!args) { + if (!min && !max) { + return 1; + } else { + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got none", + name, (min == max ? "" : "at least "), (int)min); + return 0; + } + } + if (!PyTuple_Check(args)) { + if (min <= 1 && max >= 1) { + Py_ssize_t i; + objs[0] = args; + for (i = 1; i < max; ++i) { + objs[i] = 0; + } + return 2; + } + PyErr_SetString(PyExc_SystemError, "UnpackTuple() argument list is not a tuple"); + return 0; + } else { + Py_ssize_t l = PyTuple_GET_SIZE(args); + if (l < min) { + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", + name, (min == max ? "" : "at least "), (int)min, (int)l); + return 0; + } else if (l > max) { + PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", + name, (min == max ? 
"" : "at most "), (int)max, (int)l); + return 0; + } else { + Py_ssize_t i; + for (i = 0; i < l; ++i) { + objs[i] = PyTuple_GET_ITEM(args, i); + } + for (; l < max; ++l) { + objs[l] = 0; + } + return i + 1; + } + } +} + +/* A functor is a function object with one single object argument */ +#if PY_VERSION_HEX >= 0x02020000 +#define SWIG_Python_CallFunctor(functor, obj) PyObject_CallFunctionObjArgs(functor, obj, NULL); +#else +#define SWIG_Python_CallFunctor(functor, obj) PyObject_CallFunction(functor, "O", obj); +#endif + +/* + Helper for static pointer initialization for both C and C++ code, for example + static PyObject *SWIG_STATIC_POINTER(MyVar) = NewSomething(...); +*/ +#ifdef __cplusplus +#define SWIG_STATIC_POINTER(var) var +#else +#define SWIG_STATIC_POINTER(var) var = 0; if (!var) var +#endif + +/* ----------------------------------------------------------------------------- + * Pointer declarations + * ----------------------------------------------------------------------------- */ + +/* Flags for new pointer objects */ +#define SWIG_POINTER_NOSHADOW (SWIG_POINTER_OWN << 1) +#define SWIG_POINTER_NEW (SWIG_POINTER_NOSHADOW | SWIG_POINTER_OWN) + +#define SWIG_POINTER_IMPLICIT_CONV (SWIG_POINTER_DISOWN << 1) + +#define SWIG_BUILTIN_TP_INIT (SWIG_POINTER_OWN << 2) +#define SWIG_BUILTIN_INIT (SWIG_BUILTIN_TP_INIT | SWIG_POINTER_OWN) + +#ifdef __cplusplus +extern "C" { +#endif + +/* How to access Py_None */ +#if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) +# ifndef SWIG_PYTHON_NO_BUILD_NONE +# ifndef SWIG_PYTHON_BUILD_NONE +# define SWIG_PYTHON_BUILD_NONE +# endif +# endif +#endif + +#ifdef SWIG_PYTHON_BUILD_NONE +# ifdef Py_None +# undef Py_None +# define Py_None SWIG_Py_None() +# endif +SWIGRUNTIMEINLINE PyObject * +_SWIG_Py_None(void) +{ + PyObject *none = Py_BuildValue((char*)""); + Py_DECREF(none); + return none; +} +SWIGRUNTIME PyObject * +SWIG_Py_None(void) +{ + static PyObject *SWIG_STATIC_POINTER(none) = _SWIG_Py_None(); + return none; +} +#endif + +/* The python void return value */ + +SWIGRUNTIMEINLINE PyObject * +SWIG_Py_Void(void) +{ + PyObject *none = Py_None; + Py_INCREF(none); + return none; +} + +/* SwigPyClientData */ + +typedef struct { + PyObject *klass; + PyObject *newraw; + PyObject *newargs; + PyObject *destroy; + int delargs; + int implicitconv; + PyTypeObject *pytype; +} SwigPyClientData; + +SWIGRUNTIMEINLINE int +SWIG_Python_CheckImplicit(swig_type_info *ty) +{ + SwigPyClientData *data = (SwigPyClientData *)ty->clientdata; + return data ? data->implicitconv : 0; +} + +SWIGRUNTIMEINLINE PyObject * +SWIG_Python_ExceptionType(swig_type_info *desc) { + SwigPyClientData *data = desc ? (SwigPyClientData *) desc->clientdata : 0; + PyObject *klass = data ? data->klass : 0; + return (klass ? 
klass : PyExc_RuntimeError); +} + + +SWIGRUNTIME SwigPyClientData * +SwigPyClientData_New(PyObject* obj) +{ + if (!obj) { + return 0; + } else { + SwigPyClientData *data = (SwigPyClientData *)malloc(sizeof(SwigPyClientData)); + /* the klass element */ + data->klass = obj; + Py_INCREF(data->klass); + /* the newraw method and newargs arguments used to create a new raw instance */ + if (PyClass_Check(obj)) { + data->newraw = 0; + data->newargs = obj; + Py_INCREF(obj); + } else { +#if (PY_VERSION_HEX < 0x02020000) + data->newraw = 0; +#else + data->newraw = PyObject_GetAttrString(data->klass, (char *)"__new__"); +#endif + if (data->newraw) { + Py_INCREF(data->newraw); + data->newargs = PyTuple_New(1); + PyTuple_SetItem(data->newargs, 0, obj); + } else { + data->newargs = obj; + } + Py_INCREF(data->newargs); + } + /* the destroy method, aka as the C++ delete method */ + data->destroy = PyObject_GetAttrString(data->klass, (char *)"__swig_destroy__"); + if (PyErr_Occurred()) { + PyErr_Clear(); + data->destroy = 0; + } + if (data->destroy) { + int flags; + Py_INCREF(data->destroy); + flags = PyCFunction_GET_FLAGS(data->destroy); +#ifdef METH_O + data->delargs = !(flags & (METH_O)); +#else + data->delargs = 0; +#endif + } else { + data->delargs = 0; + } + data->implicitconv = 0; + data->pytype = 0; + return data; + } +} + +SWIGRUNTIME void +SwigPyClientData_Del(SwigPyClientData *data) { + Py_XDECREF(data->newraw); + Py_XDECREF(data->newargs); + Py_XDECREF(data->destroy); +} + +/* =============== SwigPyObject =====================*/ + +typedef struct { + PyObject_HEAD + void *ptr; + swig_type_info *ty; + int own; + PyObject *next; +#ifdef SWIGPYTHON_BUILTIN + PyObject *dict; +#endif +} SwigPyObject; + + +#ifdef SWIGPYTHON_BUILTIN + +SWIGRUNTIME PyObject * +SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) +{ + SwigPyObject *sobj = (SwigPyObject *)v; + + if (!sobj->dict) + sobj->dict = PyDict_New(); + + Py_INCREF(sobj->dict); + return sobj->dict; +} + +#endif + +SWIGRUNTIME PyObject * +SwigPyObject_long(SwigPyObject *v) +{ + return PyLong_FromVoidPtr(v->ptr); +} + +SWIGRUNTIME PyObject * +SwigPyObject_format(const char* fmt, SwigPyObject *v) +{ + PyObject *res = NULL; + PyObject *args = PyTuple_New(1); + if (args) { + if (PyTuple_SetItem(args, 0, SwigPyObject_long(v)) == 0) { + PyObject *ofmt = SWIG_Python_str_FromChar(fmt); + if (ofmt) { +#if PY_VERSION_HEX >= 0x03000000 + res = PyUnicode_Format(ofmt,args); +#else + res = PyString_Format(ofmt,args); +#endif + Py_DECREF(ofmt); + } + Py_DECREF(args); + } + } + return res; +} + +SWIGRUNTIME PyObject * +SwigPyObject_oct(SwigPyObject *v) +{ + return SwigPyObject_format("%o",v); +} + +SWIGRUNTIME PyObject * +SwigPyObject_hex(SwigPyObject *v) +{ + return SwigPyObject_format("%x",v); +} + +SWIGRUNTIME PyObject * +#ifdef METH_NOARGS +SwigPyObject_repr(SwigPyObject *v) +#else +SwigPyObject_repr(SwigPyObject *v, PyObject *args) +#endif +{ + const char *name = SWIG_TypePrettyName(v->ty); + PyObject *repr = SWIG_Python_str_FromFormat("", (name ? 
name : "unknown"), (void *)v); + if (v->next) { +# ifdef METH_NOARGS + PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next); +# else + PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next, args); +# endif +# if PY_VERSION_HEX >= 0x03000000 + PyObject *joined = PyUnicode_Concat(repr, nrep); + Py_DecRef(repr); + Py_DecRef(nrep); + repr = joined; +# else + PyString_ConcatAndDel(&repr,nrep); +# endif + } + return repr; +} + +SWIGRUNTIME int +SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w) +{ + void *i = v->ptr; + void *j = w->ptr; + return (i < j) ? -1 : ((i > j) ? 1 : 0); +} + +/* Added for Python 3.x, would it also be useful for Python 2.x? */ +SWIGRUNTIME PyObject* +SwigPyObject_richcompare(SwigPyObject *v, SwigPyObject *w, int op) +{ + PyObject* res; + if( op != Py_EQ && op != Py_NE ) { + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + res = PyBool_FromLong( (SwigPyObject_compare(v, w)==0) == (op == Py_EQ) ? 1 : 0); + return res; +} + + +SWIGRUNTIME PyTypeObject* SwigPyObject_TypeOnce(void); + +#ifdef SWIGPYTHON_BUILTIN +static swig_type_info *SwigPyObject_stype = 0; +SWIGRUNTIME PyTypeObject* +SwigPyObject_type(void) { + SwigPyClientData *cd; + assert(SwigPyObject_stype); + cd = (SwigPyClientData*) SwigPyObject_stype->clientdata; + assert(cd); + assert(cd->pytype); + return cd->pytype; +} +#else +SWIGRUNTIME PyTypeObject* +SwigPyObject_type(void) { + static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyObject_TypeOnce(); + return type; +} +#endif + +SWIGRUNTIMEINLINE int +SwigPyObject_Check(PyObject *op) { +#ifdef SWIGPYTHON_BUILTIN + PyTypeObject *target_tp = SwigPyObject_type(); + if (PyType_IsSubtype(op->ob_type, target_tp)) + return 1; + return (strcmp(op->ob_type->tp_name, "SwigPyObject") == 0); +#else + return (Py_TYPE(op) == SwigPyObject_type()) + || (strcmp(Py_TYPE(op)->tp_name,"SwigPyObject") == 0); +#endif +} + +SWIGRUNTIME PyObject * +SwigPyObject_New(void *ptr, swig_type_info *ty, int own); + +SWIGRUNTIME void +SwigPyObject_dealloc(PyObject *v) +{ + SwigPyObject *sobj = (SwigPyObject *) v; + PyObject *next = sobj->next; + if (sobj->own == SWIG_POINTER_OWN) { + swig_type_info *ty = sobj->ty; + SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0; + PyObject *destroy = data ? data->destroy : 0; + if (destroy) { + /* destroy is always a VARARGS method */ + PyObject *res; + + /* PyObject_CallFunction() has the potential to silently drop + the active active exception. In cases of unnamed temporary + variable or where we just finished iterating over a generator + StopIteration will be active right now, and this needs to + remain true upon return from SwigPyObject_dealloc. So save + and restore. */ + + PyObject *val = NULL, *type = NULL, *tb = NULL; + PyErr_Fetch(&val, &type, &tb); + + if (data->delargs) { + /* we need to create a temporary object to carry the destroy operation */ + PyObject *tmp = SwigPyObject_New(sobj->ptr, ty, 0); + res = SWIG_Python_CallFunctor(destroy, tmp); + Py_DECREF(tmp); + } else { + PyCFunction meth = PyCFunction_GET_FUNCTION(destroy); + PyObject *mself = PyCFunction_GET_SELF(destroy); + res = ((*meth)(mself, v)); + } + if (!res) + PyErr_WriteUnraisable(destroy); + + PyErr_Restore(val, type, tb); + + Py_XDECREF(res); + } +#if !defined(SWIG_PYTHON_SILENT_MEMLEAK) + else { + const char *name = SWIG_TypePrettyName(ty); + printf("swig/python detected a memory leak of type '%s', no destructor found.\n", (name ? 
name : "unknown")); + } +#endif + } + Py_XDECREF(next); + PyObject_DEL(v); +} + +SWIGRUNTIME PyObject* +SwigPyObject_append(PyObject* v, PyObject* next) +{ + SwigPyObject *sobj = (SwigPyObject *) v; +#ifndef METH_O + PyObject *tmp = 0; + if (!PyArg_ParseTuple(next,(char *)"O:append", &tmp)) return NULL; + next = tmp; +#endif + if (!SwigPyObject_Check(next)) { + PyErr_SetString(PyExc_TypeError, "Attempt to append a non SwigPyObject"); + return NULL; + } + sobj->next = next; + Py_INCREF(next); + return SWIG_Py_Void(); +} + +SWIGRUNTIME PyObject* +#ifdef METH_NOARGS +SwigPyObject_next(PyObject* v) +#else +SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +#endif +{ + SwigPyObject *sobj = (SwigPyObject *) v; + if (sobj->next) { + Py_INCREF(sobj->next); + return sobj->next; + } else { + return SWIG_Py_Void(); + } +} + +SWIGINTERN PyObject* +#ifdef METH_NOARGS +SwigPyObject_disown(PyObject *v) +#else +SwigPyObject_disown(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +#endif +{ + SwigPyObject *sobj = (SwigPyObject *)v; + sobj->own = 0; + return SWIG_Py_Void(); +} + +SWIGINTERN PyObject* +#ifdef METH_NOARGS +SwigPyObject_acquire(PyObject *v) +#else +SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args)) +#endif +{ + SwigPyObject *sobj = (SwigPyObject *)v; + sobj->own = SWIG_POINTER_OWN; + return SWIG_Py_Void(); +} + +SWIGINTERN PyObject* +SwigPyObject_own(PyObject *v, PyObject *args) +{ + PyObject *val = 0; +#if (PY_VERSION_HEX < 0x02020000) + if (!PyArg_ParseTuple(args,(char *)"|O:own",&val)) +#elif (PY_VERSION_HEX < 0x02050000) + if (!PyArg_UnpackTuple(args, (char *)"own", 0, 1, &val)) +#else + if (!PyArg_UnpackTuple(args, "own", 0, 1, &val)) +#endif + { + return NULL; + } + else + { + SwigPyObject *sobj = (SwigPyObject *)v; + PyObject *obj = PyBool_FromLong(sobj->own); + if (val) { +#ifdef METH_NOARGS + if (PyObject_IsTrue(val)) { + SwigPyObject_acquire(v); + } else { + SwigPyObject_disown(v); + } +#else + if (PyObject_IsTrue(val)) { + SwigPyObject_acquire(v,args); + } else { + SwigPyObject_disown(v,args); + } +#endif + } + return obj; + } +} + +#ifdef METH_O +static PyMethodDef +swigobject_methods[] = { + {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_NOARGS, (char *)"releases ownership of the pointer"}, + {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS, (char *)"acquires ownership of the pointer"}, + {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, (char *)"returns/sets ownership of the pointer"}, + {(char *)"append", (PyCFunction)SwigPyObject_append, METH_O, (char *)"appends another 'this' object"}, + {(char *)"next", (PyCFunction)SwigPyObject_next, METH_NOARGS, (char *)"returns the next 'this' object"}, + {(char *)"__repr__",(PyCFunction)SwigPyObject_repr, METH_NOARGS, (char *)"returns object representation"}, + {0, 0, 0, 0} +}; +#else +static PyMethodDef +swigobject_methods[] = { + {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_VARARGS, (char *)"releases ownership of the pointer"}, + {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_VARARGS, (char *)"acquires ownership of the pointer"}, + {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS, (char *)"returns/sets ownership of the pointer"}, + {(char *)"append", (PyCFunction)SwigPyObject_append, METH_VARARGS, (char *)"appends another 'this' object"}, + {(char *)"next", (PyCFunction)SwigPyObject_next, METH_VARARGS, (char *)"returns the next 'this' object"}, + {(char *)"__repr__",(PyCFunction)SwigPyObject_repr, METH_VARARGS, (char *)"returns object 
representation"}, + {0, 0, 0, 0} +}; +#endif + +#if PY_VERSION_HEX < 0x02020000 +SWIGINTERN PyObject * +SwigPyObject_getattr(SwigPyObject *sobj,char *name) +{ + return Py_FindMethod(swigobject_methods, (PyObject *)sobj, name); +} +#endif + +SWIGRUNTIME PyTypeObject* +SwigPyObject_TypeOnce(void) { + static char swigobject_doc[] = "Swig object carries a C/C++ instance pointer"; + + static PyNumberMethods SwigPyObject_as_number = { + (binaryfunc)0, /*nb_add*/ + (binaryfunc)0, /*nb_subtract*/ + (binaryfunc)0, /*nb_multiply*/ + /* nb_divide removed in Python 3 */ +#if PY_VERSION_HEX < 0x03000000 + (binaryfunc)0, /*nb_divide*/ +#endif + (binaryfunc)0, /*nb_remainder*/ + (binaryfunc)0, /*nb_divmod*/ + (ternaryfunc)0,/*nb_power*/ + (unaryfunc)0, /*nb_negative*/ + (unaryfunc)0, /*nb_positive*/ + (unaryfunc)0, /*nb_absolute*/ + (inquiry)0, /*nb_nonzero*/ + 0, /*nb_invert*/ + 0, /*nb_lshift*/ + 0, /*nb_rshift*/ + 0, /*nb_and*/ + 0, /*nb_xor*/ + 0, /*nb_or*/ +#if PY_VERSION_HEX < 0x03000000 + 0, /*nb_coerce*/ +#endif + (unaryfunc)SwigPyObject_long, /*nb_int*/ +#if PY_VERSION_HEX < 0x03000000 + (unaryfunc)SwigPyObject_long, /*nb_long*/ +#else + 0, /*nb_reserved*/ +#endif + (unaryfunc)0, /*nb_float*/ +#if PY_VERSION_HEX < 0x03000000 + (unaryfunc)SwigPyObject_oct, /*nb_oct*/ + (unaryfunc)SwigPyObject_hex, /*nb_hex*/ +#endif +#if PY_VERSION_HEX >= 0x03050000 /* 3.5 */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_matrix_multiply */ +#elif PY_VERSION_HEX >= 0x03000000 /* 3.0 */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index, nb_inplace_divide removed */ +#elif PY_VERSION_HEX >= 0x02050000 /* 2.5.0 */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index */ +#elif PY_VERSION_HEX >= 0x02020000 /* 2.2.0 */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_true_divide */ +#elif PY_VERSION_HEX >= 0x02000000 /* 2.0.0 */ + 0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_or */ +#endif + }; + + static PyTypeObject swigpyobject_type; + static int type_init = 0; + if (!type_init) { + const PyTypeObject tmp = { +#if PY_VERSION_HEX >= 0x03000000 + PyVarObject_HEAD_INIT(NULL, 0) +#else + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ +#endif + (char *)"SwigPyObject", /* tp_name */ + sizeof(SwigPyObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)SwigPyObject_dealloc, /* tp_dealloc */ + 0, /* tp_print */ +#if PY_VERSION_HEX < 0x02020000 + (getattrfunc)SwigPyObject_getattr, /* tp_getattr */ +#else + (getattrfunc)0, /* tp_getattr */ +#endif + (setattrfunc)0, /* tp_setattr */ +#if PY_VERSION_HEX >= 0x03000000 + 0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */ +#else + (cmpfunc)SwigPyObject_compare, /* tp_compare */ +#endif + (reprfunc)SwigPyObject_repr, /* tp_repr */ + &SwigPyObject_as_number, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + (hashfunc)0, /* tp_hash */ + (ternaryfunc)0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + swigobject_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + (richcmpfunc)SwigPyObject_richcompare,/* tp_richcompare */ + 0, /* tp_weaklistoffset */ +#if PY_VERSION_HEX >= 0x02020000 + 0, /* tp_iter */ + 0, /* tp_iternext */ + swigobject_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 
0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ +#endif +#if PY_VERSION_HEX >= 0x02030000 + 0, /* tp_del */ +#endif +#if PY_VERSION_HEX >= 0x02060000 + 0, /* tp_version_tag */ +#endif +#if PY_VERSION_HEX >= 0x03040000 + 0, /* tp_finalize */ +#endif +#ifdef COUNT_ALLOCS + 0, /* tp_allocs */ + 0, /* tp_frees */ + 0, /* tp_maxalloc */ +#if PY_VERSION_HEX >= 0x02050000 + 0, /* tp_prev */ +#endif + 0 /* tp_next */ +#endif + }; + swigpyobject_type = tmp; + type_init = 1; +#if PY_VERSION_HEX < 0x02020000 + swigpyobject_type.ob_type = &PyType_Type; +#else + if (PyType_Ready(&swigpyobject_type) < 0) + return NULL; +#endif + } + return &swigpyobject_type; +} + +SWIGRUNTIME PyObject * +SwigPyObject_New(void *ptr, swig_type_info *ty, int own) +{ + SwigPyObject *sobj = PyObject_NEW(SwigPyObject, SwigPyObject_type()); + if (sobj) { + sobj->ptr = ptr; + sobj->ty = ty; + sobj->own = own; + sobj->next = 0; + } + return (PyObject *)sobj; +} + +/* ----------------------------------------------------------------------------- + * Implements a simple Swig Packed type, and use it instead of string + * ----------------------------------------------------------------------------- */ + +typedef struct { + PyObject_HEAD + void *pack; + swig_type_info *ty; + size_t size; +} SwigPyPacked; + +SWIGRUNTIME int +SwigPyPacked_print(SwigPyPacked *v, FILE *fp, int SWIGUNUSEDPARM(flags)) +{ + char result[SWIG_BUFFER_SIZE]; + fputs("<Swig Packed ", fp); + if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) { + fputs("at ", fp); + fputs(result, fp); + } + fputs(v->ty->name,fp); + fputs(">", fp); + return 0; +} + +SWIGRUNTIME PyObject * +SwigPyPacked_repr(SwigPyPacked *v) +{ + char result[SWIG_BUFFER_SIZE]; + if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) { + return SWIG_Python_str_FromFormat("<Swig Packed at %s%s>", result, v->ty->name); + } else { + return SWIG_Python_str_FromFormat("<Swig Packed %s>", v->ty->name); + } +} + +SWIGRUNTIME PyObject * +SwigPyPacked_str(SwigPyPacked *v) +{ + char result[SWIG_BUFFER_SIZE]; + if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))){ + return SWIG_Python_str_FromFormat("%s%s", result, v->ty->name); + } else { + return SWIG_Python_str_FromChar(v->ty->name); + } +} + +SWIGRUNTIME int +SwigPyPacked_compare(SwigPyPacked *v, SwigPyPacked *w) +{ + size_t i = v->size; + size_t j = w->size; + int s = (i < j) ? -1 : ((i > j) ? 1 : 0); + return s ?
s : strncmp((char *)v->pack, (char *)w->pack, 2*v->size); +} + +SWIGRUNTIME PyTypeObject* SwigPyPacked_TypeOnce(void); + +SWIGRUNTIME PyTypeObject* +SwigPyPacked_type(void) { + static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyPacked_TypeOnce(); + return type; +} + +SWIGRUNTIMEINLINE int +SwigPyPacked_Check(PyObject *op) { + return ((op)->ob_type == SwigPyPacked_TypeOnce()) + || (strcmp((op)->ob_type->tp_name,"SwigPyPacked") == 0); +} + +SWIGRUNTIME void +SwigPyPacked_dealloc(PyObject *v) +{ + if (SwigPyPacked_Check(v)) { + SwigPyPacked *sobj = (SwigPyPacked *) v; + free(sobj->pack); + } + PyObject_DEL(v); +} + +SWIGRUNTIME PyTypeObject* +SwigPyPacked_TypeOnce(void) { + static char swigpacked_doc[] = "Swig object carries a C/C++ instance pointer"; + static PyTypeObject swigpypacked_type; + static int type_init = 0; + if (!type_init) { + const PyTypeObject tmp = { +#if PY_VERSION_HEX>=0x03000000 + PyVarObject_HEAD_INIT(NULL, 0) +#else + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ +#endif + (char *)"SwigPyPacked", /* tp_name */ + sizeof(SwigPyPacked), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)SwigPyPacked_dealloc, /* tp_dealloc */ + (printfunc)SwigPyPacked_print, /* tp_print */ + (getattrfunc)0, /* tp_getattr */ + (setattrfunc)0, /* tp_setattr */ +#if PY_VERSION_HEX>=0x03000000 + 0, /* tp_reserved in 3.0.1 */ +#else + (cmpfunc)SwigPyPacked_compare, /* tp_compare */ +#endif + (reprfunc)SwigPyPacked_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + (hashfunc)0, /* tp_hash */ + (ternaryfunc)0, /* tp_call */ + (reprfunc)SwigPyPacked_str, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + swigpacked_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ +#if PY_VERSION_HEX >= 0x02020000 + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ +#endif +#if PY_VERSION_HEX >= 0x02030000 + 0, /* tp_del */ +#endif +#if PY_VERSION_HEX >= 0x02060000 + 0, /* tp_version_tag */ +#endif +#if PY_VERSION_HEX >= 0x03040000 + 0, /* tp_finalize */ +#endif +#ifdef COUNT_ALLOCS + 0, /* tp_allocs */ + 0, /* tp_frees */ + 0, /* tp_maxalloc */ +#if PY_VERSION_HEX >= 0x02050000 + 0, /* tp_prev */ +#endif + 0 /* tp_next */ +#endif + }; + swigpypacked_type = tmp; + type_init = 1; +#if PY_VERSION_HEX < 0x02020000 + swigpypacked_type.ob_type = &PyType_Type; +#else + if (PyType_Ready(&swigpypacked_type) < 0) + return NULL; +#endif + } + return &swigpypacked_type; +} + +SWIGRUNTIME PyObject * +SwigPyPacked_New(void *ptr, size_t size, swig_type_info *ty) +{ + SwigPyPacked *sobj = PyObject_NEW(SwigPyPacked, SwigPyPacked_type()); + if (sobj) { + void *pack = malloc(size); + if (pack) { + memcpy(pack, ptr, size); + sobj->pack = pack; + sobj->ty = ty; + sobj->size = size; + } else { + PyObject_DEL((PyObject *) sobj); + sobj = 0; + } + } + return (PyObject *) sobj; +} + +SWIGRUNTIME swig_type_info * +SwigPyPacked_UnpackData(PyObject *obj, void *ptr, size_t size) +{ + if (SwigPyPacked_Check(obj)) { + SwigPyPacked *sobj = (SwigPyPacked *)obj; + if (sobj->size 
!= size) return 0; + memcpy(ptr, sobj->pack, size); + return sobj->ty; + } else { + return 0; + } +} + +/* ----------------------------------------------------------------------------- + * pointers/data manipulation + * ----------------------------------------------------------------------------- */ + +SWIGRUNTIMEINLINE PyObject * +_SWIG_This(void) +{ + return SWIG_Python_str_FromChar("this"); +} + +static PyObject *swig_this = NULL; + +SWIGRUNTIME PyObject * +SWIG_This(void) +{ + if (swig_this == NULL) + swig_this = _SWIG_This(); + return swig_this; +} + +/* #define SWIG_PYTHON_SLOW_GETSET_THIS */ + +/* TODO: I don't know how to implement the fast getset in Python 3 right now */ +#if PY_VERSION_HEX>=0x03000000 +#define SWIG_PYTHON_SLOW_GETSET_THIS +#endif + +SWIGRUNTIME SwigPyObject * +SWIG_Python_GetSwigThis(PyObject *pyobj) +{ + PyObject *obj; + + if (SwigPyObject_Check(pyobj)) + return (SwigPyObject *) pyobj; + +#ifdef SWIGPYTHON_BUILTIN + (void)obj; +# ifdef PyWeakref_CheckProxy + if (PyWeakref_CheckProxy(pyobj)) { + pyobj = PyWeakref_GET_OBJECT(pyobj); + if (pyobj && SwigPyObject_Check(pyobj)) + return (SwigPyObject*) pyobj; + } +# endif + return NULL; +#else + + obj = 0; + +#if (!defined(SWIG_PYTHON_SLOW_GETSET_THIS) && (PY_VERSION_HEX >= 0x02030000)) + if (PyInstance_Check(pyobj)) { + obj = _PyInstance_Lookup(pyobj, SWIG_This()); + } else { + PyObject **dictptr = _PyObject_GetDictPtr(pyobj); + if (dictptr != NULL) { + PyObject *dict = *dictptr; + obj = dict ? PyDict_GetItem(dict, SWIG_This()) : 0; + } else { +#ifdef PyWeakref_CheckProxy + if (PyWeakref_CheckProxy(pyobj)) { + PyObject *wobj = PyWeakref_GET_OBJECT(pyobj); + return wobj ? SWIG_Python_GetSwigThis(wobj) : 0; + } +#endif + obj = PyObject_GetAttr(pyobj,SWIG_This()); + if (obj) { + Py_DECREF(obj); + } else { + if (PyErr_Occurred()) PyErr_Clear(); + return 0; + } + } + } +#else + obj = PyObject_GetAttr(pyobj,SWIG_This()); + if (obj) { + Py_DECREF(obj); + } else { + if (PyErr_Occurred()) PyErr_Clear(); + return 0; + } +#endif + if (obj && !SwigPyObject_Check(obj)) { + /* a PyObject is called 'this', try to get the 'real this' + SwigPyObject from it */ + return SWIG_Python_GetSwigThis(obj); + } + return (SwigPyObject *)obj; +#endif +} + +/* Acquire a pointer value */ + +SWIGRUNTIME int +SWIG_Python_AcquirePtr(PyObject *obj, int own) { + if (own == SWIG_POINTER_OWN) { + SwigPyObject *sobj = SWIG_Python_GetSwigThis(obj); + if (sobj) { + int oldown = sobj->own; + sobj->own = own; + return oldown; + } + } + return 0; +} + +/* Convert a pointer value */ + +SWIGRUNTIME int +SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int flags, int *own) { + int res; + SwigPyObject *sobj; + int implicit_conv = (flags & SWIG_POINTER_IMPLICIT_CONV) != 0; + + if (!obj) + return SWIG_ERROR; + if (obj == Py_None && !implicit_conv) { + if (ptr) + *ptr = 0; + return SWIG_OK; + } + + res = SWIG_ERROR; + + sobj = SWIG_Python_GetSwigThis(obj); + if (own) + *own = 0; + while (sobj) { + void *vptr = sobj->ptr; + if (ty) { + swig_type_info *to = sobj->ty; + if (to == ty) { + /* no type cast needed */ + if (ptr) *ptr = vptr; + break; + } else { + swig_cast_info *tc = SWIG_TypeCheck(to->name,ty); + if (!tc) { + sobj = (SwigPyObject *)sobj->next; + } else { + if (ptr) { + int newmemory = 0; + *ptr = SWIG_TypeCast(tc,vptr,&newmemory); + if (newmemory == SWIG_CAST_NEW_MEMORY) { + assert(own); /* badly formed typemap which will lead to a memory leak - it must set and use own to delete *ptr */ + if (own) + *own = *own | 
SWIG_CAST_NEW_MEMORY; + } + } + break; + } + } + } else { + if (ptr) *ptr = vptr; + break; + } + } + if (sobj) { + if (own) + *own = *own | sobj->own; + if (flags & SWIG_POINTER_DISOWN) { + sobj->own = 0; + } + res = SWIG_OK; + } else { + if (implicit_conv) { + SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0; + if (data && !data->implicitconv) { + PyObject *klass = data->klass; + if (klass) { + PyObject *impconv; + data->implicitconv = 1; /* avoid recursion and call 'explicit' constructors*/ + impconv = SWIG_Python_CallFunctor(klass, obj); + data->implicitconv = 0; + if (PyErr_Occurred()) { + PyErr_Clear(); + impconv = 0; + } + if (impconv) { + SwigPyObject *iobj = SWIG_Python_GetSwigThis(impconv); + if (iobj) { + void *vptr; + res = SWIG_Python_ConvertPtrAndOwn((PyObject*)iobj, &vptr, ty, 0, 0); + if (SWIG_IsOK(res)) { + if (ptr) { + *ptr = vptr; + /* transfer the ownership to 'ptr' */ + iobj->own = 0; + res = SWIG_AddCast(res); + res = SWIG_AddNewMask(res); + } else { + res = SWIG_AddCast(res); + } + } + } + Py_DECREF(impconv); + } + } + } + } + if (!SWIG_IsOK(res) && obj == Py_None) { + if (ptr) + *ptr = 0; + if (PyErr_Occurred()) + PyErr_Clear(); + res = SWIG_OK; + } + } + return res; +} + +/* Convert a function ptr value */ + +SWIGRUNTIME int +SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, swig_type_info *ty) { + if (!PyCFunction_Check(obj)) { + return SWIG_ConvertPtr(obj, ptr, ty, 0); + } else { + void *vptr = 0; + + /* here we get the method pointer for callbacks */ + const char *doc = (((PyCFunctionObject *)obj) -> m_ml -> ml_doc); + const char *desc = doc ? strstr(doc, "swig_ptr: ") : 0; + if (desc) + desc = ty ? SWIG_UnpackVoidPtr(desc + 10, &vptr, ty->name) : 0; + if (!desc) + return SWIG_ERROR; + if (ty) { + swig_cast_info *tc = SWIG_TypeCheck(desc,ty); + if (tc) { + int newmemory = 0; + *ptr = SWIG_TypeCast(tc,vptr,&newmemory); + assert(!newmemory); /* newmemory handling not yet implemented */ + } else { + return SWIG_ERROR; + } + } else { + *ptr = vptr; + } + return SWIG_OK; + } +} + +/* Convert a packed value value */ + +SWIGRUNTIME int +SWIG_Python_ConvertPacked(PyObject *obj, void *ptr, size_t sz, swig_type_info *ty) { + swig_type_info *to = SwigPyPacked_UnpackData(obj, ptr, sz); + if (!to) return SWIG_ERROR; + if (ty) { + if (to != ty) { + /* check type cast? */ + swig_cast_info *tc = SWIG_TypeCheck(to->name,ty); + if (!tc) return SWIG_ERROR; + } + } + return SWIG_OK; +} + +/* ----------------------------------------------------------------------------- + * Create a new pointer object + * ----------------------------------------------------------------------------- */ + +/* + Create a new instance object, without calling __init__, and set the + 'this' attribute. 
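SWIG_Python_ConvertPtrAndOwn above is the workhorse behind the SWIG_ConvertPtr macro: it resolves the 'this' chain, walks the cast table, and reports ownership back to the caller. As a hedged sketch of how a handwritten wrapper typically drives it (SWIGTYPE_p_Tensor and ndims() are illustrative names, not part of this patch):

```cpp
// Sketch only: convert a PyObject* argument to a typed C++ pointer via the
// SWIG runtime. SWIGTYPE_p_Tensor stands in for a generated type descriptor.
static PyObject *tensor_ndims(PyObject *self, PyObject *arg) {
  void *argp = nullptr;
  const int res = SWIG_ConvertPtr(arg, &argp, SWIGTYPE_p_Tensor, 0);
  if (!SWIG_IsOK(res)) {
    SWIG_Python_TypeError("Tensor *", arg);  // raise a descriptive TypeError
    return nullptr;
  }
  Tensor *t = reinterpret_cast<Tensor *>(argp);
  return PyLong_FromLong(t->ndims());
}
```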
+*/ + +SWIGRUNTIME PyObject* +SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this) +{ +#if (PY_VERSION_HEX >= 0x02020000) + PyObject *inst = 0; + PyObject *newraw = data->newraw; + if (newraw) { + inst = PyObject_Call(newraw, data->newargs, NULL); + if (inst) { +#if !defined(SWIG_PYTHON_SLOW_GETSET_THIS) + PyObject **dictptr = _PyObject_GetDictPtr(inst); + if (dictptr != NULL) { + PyObject *dict = *dictptr; + if (dict == NULL) { + dict = PyDict_New(); + *dictptr = dict; + PyDict_SetItem(dict, SWIG_This(), swig_this); + } + } +#else + PyObject *key = SWIG_This(); + PyObject_SetAttr(inst, key, swig_this); +#endif + } + } else { +#if PY_VERSION_HEX >= 0x03000000 + inst = ((PyTypeObject*) data->newargs)->tp_new((PyTypeObject*) data->newargs, Py_None, Py_None); + if (inst) { + PyObject_SetAttr(inst, SWIG_This(), swig_this); + Py_TYPE(inst)->tp_flags &= ~Py_TPFLAGS_VALID_VERSION_TAG; + } +#else + PyObject *dict = PyDict_New(); + if (dict) { + PyDict_SetItem(dict, SWIG_This(), swig_this); + inst = PyInstance_NewRaw(data->newargs, dict); + Py_DECREF(dict); + } +#endif + } + return inst; +#else +#if (PY_VERSION_HEX >= 0x02010000) + PyObject *inst = 0; + PyObject *dict = PyDict_New(); + if (dict) { + PyDict_SetItem(dict, SWIG_This(), swig_this); + inst = PyInstance_NewRaw(data->newargs, dict); + Py_DECREF(dict); + } + return (PyObject *) inst; +#else + PyInstanceObject *inst = PyObject_NEW(PyInstanceObject, &PyInstance_Type); + if (inst == NULL) { + return NULL; + } + inst->in_class = (PyClassObject *)data->newargs; + Py_INCREF(inst->in_class); + inst->in_dict = PyDict_New(); + if (inst->in_dict == NULL) { + Py_DECREF(inst); + return NULL; + } +#ifdef Py_TPFLAGS_HAVE_WEAKREFS + inst->in_weakreflist = NULL; +#endif +#ifdef Py_TPFLAGS_GC + PyObject_GC_Init(inst); +#endif + PyDict_SetItem(inst->in_dict, SWIG_This(), swig_this); + return (PyObject *) inst; +#endif +#endif +} + +SWIGRUNTIME void +SWIG_Python_SetSwigThis(PyObject *inst, PyObject *swig_this) +{ + PyObject *dict; +#if (PY_VERSION_HEX >= 0x02020000) && !defined(SWIG_PYTHON_SLOW_GETSET_THIS) + PyObject **dictptr = _PyObject_GetDictPtr(inst); + if (dictptr != NULL) { + dict = *dictptr; + if (dict == NULL) { + dict = PyDict_New(); + *dictptr = dict; + } + PyDict_SetItem(dict, SWIG_This(), swig_this); + return; + } +#endif + dict = PyObject_GetAttrString(inst, (char*)"__dict__"); + PyDict_SetItem(dict, SWIG_This(), swig_this); + Py_DECREF(dict); +} + + +SWIGINTERN PyObject * +SWIG_Python_InitShadowInstance(PyObject *args) { + PyObject *obj[2]; + if (!SWIG_Python_UnpackTuple(args, "swiginit", 2, 2, obj)) { + return NULL; + } else { + SwigPyObject *sthis = SWIG_Python_GetSwigThis(obj[0]); + if (sthis) { + SwigPyObject_append((PyObject*) sthis, obj[1]); + } else { + SWIG_Python_SetSwigThis(obj[0], obj[1]); + } + return SWIG_Py_Void(); + } +} + +/* Create a new pointer object */ + +SWIGRUNTIME PyObject * +SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int flags) { + SwigPyClientData *clientdata; + PyObject * robj; + int own; + + if (!ptr) + return SWIG_Py_Void(); + + clientdata = type ? (SwigPyClientData *)(type->clientdata) : 0; + own = (flags & SWIG_POINTER_OWN) ? 
SWIG_POINTER_OWN : 0; + if (clientdata && clientdata->pytype) { + SwigPyObject *newobj; + if (flags & SWIG_BUILTIN_TP_INIT) { + newobj = (SwigPyObject*) self; + if (newobj->ptr) { + PyObject *next_self = clientdata->pytype->tp_alloc(clientdata->pytype, 0); + while (newobj->next) + newobj = (SwigPyObject *) newobj->next; + newobj->next = next_self; + newobj = (SwigPyObject *)next_self; +#ifdef SWIGPYTHON_BUILTIN + newobj->dict = 0; +#endif + } + } else { + newobj = PyObject_New(SwigPyObject, clientdata->pytype); +#ifdef SWIGPYTHON_BUILTIN + newobj->dict = 0; +#endif + } + if (newobj) { + newobj->ptr = ptr; + newobj->ty = type; + newobj->own = own; + newobj->next = 0; + return (PyObject*) newobj; + } + return SWIG_Py_Void(); + } + + assert(!(flags & SWIG_BUILTIN_TP_INIT)); + + robj = SwigPyObject_New(ptr, type, own); + if (robj && clientdata && !(flags & SWIG_POINTER_NOSHADOW)) { + PyObject *inst = SWIG_Python_NewShadowInstance(clientdata, robj); + Py_DECREF(robj); + robj = inst; + } + return robj; +} + +/* Create a new packed object */ + +SWIGRUNTIMEINLINE PyObject * +SWIG_Python_NewPackedObj(void *ptr, size_t sz, swig_type_info *type) { + return ptr ? SwigPyPacked_New((void *) ptr, sz, type) : SWIG_Py_Void(); +} + +/* -----------------------------------------------------------------------------* + * Get type list + * -----------------------------------------------------------------------------*/ + +#ifdef SWIG_LINK_RUNTIME +void *SWIG_ReturnGlobalTypeList(void *); +#endif + +SWIGRUNTIME swig_module_info * +SWIG_Python_GetModule(void *SWIGUNUSEDPARM(clientdata)) { + static void *type_pointer = (void *)0; + /* first check if module already created */ + if (!type_pointer) { +#ifdef SWIG_LINK_RUNTIME + type_pointer = SWIG_ReturnGlobalTypeList((void *)0); +#else +# ifdef SWIGPY_USE_CAPSULE + type_pointer = PyCapsule_Import(SWIGPY_CAPSULE_NAME, 0); +# else + type_pointer = PyCObject_Import((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION, + (char*)"type_pointer" SWIG_TYPE_TABLE_NAME); +# endif + if (PyErr_Occurred()) { + PyErr_Clear(); + type_pointer = (void *)0; + } +#endif + } + return (swig_module_info *) type_pointer; +} + +#if PY_MAJOR_VERSION < 2 +/* PyModule_AddObject function was introduced in Python 2.0. The following function + is copied out of Python/modsupport.c in python version 2.3.4 */ +SWIGINTERN int +PyModule_AddObject(PyObject *m, char *name, PyObject *o) +{ + PyObject *dict; + if (!PyModule_Check(m)) { + PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs module as first arg"); + return SWIG_ERROR; + } + if (!o) { + PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs non-NULL value"); + return SWIG_ERROR; + } + + dict = PyModule_GetDict(m); + if (dict == NULL) { + /* Internal error -- modules must have a dict! 
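SWIG_Python_GetModule above (and SWIG_Python_SetModule below) lets every SWIG extension in a process share one swig_module_info list by parking a raw pointer in a named capsule on a dummy module. A self-contained sketch of that capsule round-trip; the module and attribute names here are illustrative, not the version-mangled ones the runtime actually builds:

```cpp
#include <Python.h>

static int shared_table = 42;  // stands in for the runtime's type-table pointer

// Publisher: park a pointer in a capsule exposed as module.attribute.
static int publish_table(PyObject *module) {
  PyObject *cap = PyCapsule_New(&shared_table, "swig_runtime_data.type_pointer", NULL);
  if (!cap)
    return -1;
  if (PyModule_AddObject(module, "type_pointer", cap) < 0) {
    Py_DECREF(cap);  // PyModule_AddObject steals the reference only on success
    return -1;
  }
  return 0;
}

// Consumer (a different extension module): fetch it back by dotted name.
static int *import_table(void) {
  return (int *)PyCapsule_Import("swig_runtime_data.type_pointer", 0);
}
```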
*/ + PyErr_Format(PyExc_SystemError, "module '%s' has no __dict__", + PyModule_GetName(m)); + return SWIG_ERROR; + } + if (PyDict_SetItemString(dict, name, o)) + return SWIG_ERROR; + Py_DECREF(o); + return SWIG_OK; +} +#endif + +SWIGRUNTIME void +#ifdef SWIGPY_USE_CAPSULE +SWIG_Python_DestroyModule(PyObject *obj) +#else +SWIG_Python_DestroyModule(void *vptr) +#endif +{ +#ifdef SWIGPY_USE_CAPSULE + swig_module_info *swig_module = (swig_module_info *) PyCapsule_GetPointer(obj, SWIGPY_CAPSULE_NAME); +#else + swig_module_info *swig_module = (swig_module_info *) vptr; +#endif + swig_type_info **types = swig_module->types; + size_t i; + for (i =0; i < swig_module->size; ++i) { + swig_type_info *ty = types[i]; + if (ty->owndata) { + SwigPyClientData *data = (SwigPyClientData *) ty->clientdata; + if (data) SwigPyClientData_Del(data); + } + } + Py_DECREF(SWIG_This()); + swig_this = NULL; +} + +SWIGRUNTIME void +SWIG_Python_SetModule(swig_module_info *swig_module) { +#if PY_VERSION_HEX >= 0x03000000 + /* Add a dummy module object into sys.modules */ + PyObject *module = PyImport_AddModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION); +#else + static PyMethodDef swig_empty_runtime_method_table[] = { {NULL, NULL, 0, NULL} }; /* Sentinel */ + PyObject *module = Py_InitModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION, swig_empty_runtime_method_table); +#endif +#ifdef SWIGPY_USE_CAPSULE + PyObject *pointer = PyCapsule_New((void *) swig_module, SWIGPY_CAPSULE_NAME, SWIG_Python_DestroyModule); + if (pointer && module) { + PyModule_AddObject(module, (char*)"type_pointer_capsule" SWIG_TYPE_TABLE_NAME, pointer); + } else { + Py_XDECREF(pointer); + } +#else + PyObject *pointer = PyCObject_FromVoidPtr((void *) swig_module, SWIG_Python_DestroyModule); + if (pointer && module) { + PyModule_AddObject(module, (char*)"type_pointer" SWIG_TYPE_TABLE_NAME, pointer); + } else { + Py_XDECREF(pointer); + } +#endif +} + +/* The python cached type query */ +SWIGRUNTIME PyObject * +SWIG_Python_TypeCache(void) { + static PyObject *SWIG_STATIC_POINTER(cache) = PyDict_New(); + return cache; +} + +SWIGRUNTIME swig_type_info * +SWIG_Python_TypeQuery(const char *type) +{ + PyObject *cache = SWIG_Python_TypeCache(); + PyObject *key = SWIG_Python_str_FromChar(type); + PyObject *obj = PyDict_GetItem(cache, key); + swig_type_info *descriptor; + if (obj) { +#ifdef SWIGPY_USE_CAPSULE + descriptor = (swig_type_info *) PyCapsule_GetPointer(obj, NULL); +#else + descriptor = (swig_type_info *) PyCObject_AsVoidPtr(obj); +#endif + } else { + swig_module_info *swig_module = SWIG_GetModule(0); + descriptor = SWIG_TypeQueryModule(swig_module, swig_module, type); + if (descriptor) { +#ifdef SWIGPY_USE_CAPSULE + obj = PyCapsule_New((void*) descriptor, NULL, NULL); +#else + obj = PyCObject_FromVoidPtr(descriptor, NULL); +#endif + PyDict_SetItem(cache, key, obj); + Py_DECREF(obj); + } + } + Py_DECREF(key); + return descriptor; +} + +/* + For backward compatibility only +*/ +#define SWIG_POINTER_EXCEPTION 0 +#define SWIG_arg_fail(arg) SWIG_Python_ArgFail(arg) +#define SWIG_MustGetPtr(p, type, argnum, flags) SWIG_Python_MustGetPtr(p, type, argnum, flags) + +SWIGRUNTIME int +SWIG_Python_AddErrMesg(const char* mesg, int infront) +{ + if (PyErr_Occurred()) { + PyObject *type = 0; + PyObject *value = 0; + PyObject *traceback = 0; + PyErr_Fetch(&type, &value, &traceback); + if (value) { + char *tmp; + PyObject *old_str = PyObject_Str(value); + Py_XINCREF(type); + PyErr_Clear(); + if (infront) { + PyErr_Format(type, "%s %s", mesg, tmp = 
SWIG_Python_str_AsChar(old_str)); + } else { + PyErr_Format(type, "%s %s", tmp = SWIG_Python_str_AsChar(old_str), mesg); + } + SWIG_Python_str_DelForPy3(tmp); + Py_DECREF(old_str); + } + return 1; + } else { + return 0; + } +} + +SWIGRUNTIME int +SWIG_Python_ArgFail(int argnum) +{ + if (PyErr_Occurred()) { + /* add information about failing argument */ + char mesg[256]; + PyOS_snprintf(mesg, sizeof(mesg), "argument number %d:", argnum); + return SWIG_Python_AddErrMesg(mesg, 1); + } else { + return 0; + } +} + +SWIGRUNTIMEINLINE const char * +SwigPyObject_GetDesc(PyObject *self) +{ + SwigPyObject *v = (SwigPyObject *)self; + swig_type_info *ty = v ? v->ty : 0; + return ty ? ty->str : ""; +} + +SWIGRUNTIME void +SWIG_Python_TypeError(const char *type, PyObject *obj) +{ + if (type) { +#if defined(SWIG_COBJECT_TYPES) + if (obj && SwigPyObject_Check(obj)) { + const char *otype = (const char *) SwigPyObject_GetDesc(obj); + if (otype) { + PyErr_Format(PyExc_TypeError, "a '%s' is expected, 'SwigPyObject(%s)' is received", + type, otype); + return; + } + } else +#endif + { + const char *otype = (obj ? obj->ob_type->tp_name : 0); + if (otype) { + PyObject *str = PyObject_Str(obj); + const char *cstr = str ? SWIG_Python_str_AsChar(str) : 0; + if (cstr) { + PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s(%s)' is received", + type, otype, cstr); + SWIG_Python_str_DelForPy3(cstr); + } else { + PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s' is received", + type, otype); + } + Py_XDECREF(str); + return; + } + } + PyErr_Format(PyExc_TypeError, "a '%s' is expected", type); + } else { + PyErr_Format(PyExc_TypeError, "unexpected type is received"); + } +} + + +/* Convert a pointer value, signal an exception on a type mismatch */ +SWIGRUNTIME void * +SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, int SWIGUNUSEDPARM(argnum), int flags) { + void *result; + if (SWIG_Python_ConvertPtr(obj, &result, ty, flags) == -1) { + PyErr_Clear(); +#if SWIG_POINTER_EXCEPTION + if (flags) { + SWIG_Python_TypeError(SWIG_TypePrettyName(ty), obj); + SWIG_Python_ArgFail(argnum); + } +#endif + } + return result; +} + +#ifdef SWIGPYTHON_BUILTIN +SWIGRUNTIME int +SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) { + PyTypeObject *tp = obj->ob_type; + PyObject *descr; + PyObject *encoded_name; + descrsetfunc f; + int res = -1; + +# ifdef Py_USING_UNICODE + if (PyString_Check(name)) { + name = PyUnicode_Decode(PyString_AsString(name), PyString_Size(name), NULL, NULL); + if (!name) + return -1; + } else if (!PyUnicode_Check(name)) +# else + if (!PyString_Check(name)) +# endif + { + PyErr_Format(PyExc_TypeError, "attribute name must be string, not '%.200s'", name->ob_type->tp_name); + return -1; + } else { + Py_INCREF(name); + } + + if (!tp->tp_dict) { + if (PyType_Ready(tp) < 0) + goto done; + } + + descr = _PyType_Lookup(tp, name); + f = NULL; + if (descr != NULL) + f = descr->ob_type->tp_descr_set; + if (!f) { + if (PyString_Check(name)) { + encoded_name = name; + Py_INCREF(name); + } else { + encoded_name = PyUnicode_AsUTF8String(name); + } + PyErr_Format(PyExc_AttributeError, "'%.100s' object has no attribute '%.200s'", tp->tp_name, PyString_AsString(encoded_name)); + Py_DECREF(encoded_name); + } else { + res = f(descr, obj, value); + } + + done: + Py_DECREF(name); + return res; +} +#endif + + +#ifdef __cplusplus +} +#endif +/* -----------------------------------------------------------------------------* + Standard SWIG API for use inside user code. 
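Everything from here on is the small, stable surface meant for user code. A hedged sketch of the usual call pair, looking a type descriptor up by name at runtime and boxing a C++ pointer with it (the "Tensor *" query string and the ownership choice are illustrative):

```cpp
// Sketch only: hand a heap-allocated C++ object to Python. With
// SWIG_POINTER_OWN the proxy deletes the object when it is collected.
static PyObject *box_tensor(Tensor *t) {
  swig_type_info *ty = SWIG_TypeQuery("Tensor *");
  if (!ty)
    Py_RETURN_NONE;  // no loaded SWIG module registered this type
  return SWIG_NewPointerObj(t, ty, SWIG_POINTER_OWN);
}
```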
+ + Don't include this file directly, run the command + swig -python -external-runtime + Also, read the Modules chapter of the SWIG Manual. + + * -----------------------------------------------------------------------------*/ + +#ifdef SWIG_MODULE_CLIENTDATA_TYPE + +SWIGRUNTIMEINLINE swig_type_info * +SWIG_TypeQuery(SWIG_MODULE_CLIENTDATA_TYPE clientdata, const char *name) { + swig_module_info *module = SWIG_GetModule(clientdata); + return SWIG_TypeQueryModule(module, module, name); +} + +SWIGRUNTIMEINLINE swig_type_info * +SWIG_MangledTypeQuery(SWIG_MODULE_CLIENTDATA_TYPE clientdata, const char *name) { + swig_module_info *module = SWIG_GetModule(clientdata); + return SWIG_MangledTypeQueryModule(module, module, name); +} + +#else + +SWIGRUNTIMEINLINE swig_type_info * +SWIG_TypeQuery(const char *name) { + swig_module_info *module = SWIG_GetModule(NULL); + return SWIG_TypeQueryModule(module, module, name); +} + +SWIGRUNTIMEINLINE swig_type_info * +SWIG_MangledTypeQuery(const char *name) { + swig_module_info *module = SWIG_GetModule(NULL); + return SWIG_MangledTypeQueryModule(module, module, name); +} + +#endif +#endif diff --git a/python/ideep4py/mm/mem.cc b/python/ideep4py/mm/mem.cc new file mode 100644 index 00000000..878c75f7 --- /dev/null +++ b/python/ideep4py/mm/mem.cc @@ -0,0 +1,130 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
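mem.cc, which follows, stamps out one static memory pool per primitive kind with a token-pasting macro. As a worked example of what the preprocessor produces, MALLOC_FREE_IMPL(reorder) expands to roughly this (prefix## pastes the pool name together, #prefix stringizes it for the Memory constructor):

```cpp
// Approximate expansion of MALLOC_FREE_IMPL(reorder) from mem.cc.
static Memory reorder_pool("reorder");
static avx::byte *reorder_malloc(size_t size) {
    return (avx::byte *)reorder_pool.malloc(size);
}
static void reorder_free(avx::byte *p) {
    return reorder_pool.free((void *)p);
}
```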
+ * + */ + + +#include <memory> +#include <numeric> +#include "mem.h" + +using namespace std; + +#define MALLOC_FREE_IMPL(prefix) \ + static Memory prefix##_pool(#prefix); \ + static avx::byte* prefix##_malloc(size_t size) { \ + return (avx::byte *)prefix##_pool.malloc(size); \ + } \ + static void prefix##_free(avx::byte *p) { \ + return prefix##_pool.free((void *)p); \ + } + +MALLOC_FREE_IMPL(anon) +MALLOC_FREE_IMPL(reorder) +MALLOC_FREE_IMPL(relu_fwd) +MALLOC_FREE_IMPL(relu_bwd) +MALLOC_FREE_IMPL(bn_fwd) +MALLOC_FREE_IMPL(bn_bwd) +MALLOC_FREE_IMPL(lrn_fwd) +MALLOC_FREE_IMPL(lrn_bwd) +MALLOC_FREE_IMPL(conv_fwd) +MALLOC_FREE_IMPL(conv_bwd) +MALLOC_FREE_IMPL(pooling_fwd) +MALLOC_FREE_IMPL(pooling_bwd) +MALLOC_FREE_IMPL(ip_fwd) +MALLOC_FREE_IMPL(ip_bwd) +MALLOC_FREE_IMPL(concat_fwd) +MALLOC_FREE_IMPL(concat_bwd) + +std::shared_ptr<avx::byte> Allocator::malloc(size_t len, mem_pool_t mpool) +{ + std::shared_ptr<avx::byte> data; + switch(mpool) { + case MPOOL_REORDER: + data = std::shared_ptr<avx::byte>(reorder_malloc(len), reorder_free); + break; + case MPOOL_ELTWISE_FWD: + data = std::shared_ptr<avx::byte>(relu_fwd_malloc(len), relu_fwd_free); + break; + case MPOOL_ELTWISE_BWD: + data = std::shared_ptr<avx::byte>(relu_bwd_malloc(len), relu_bwd_free); + break; + case MPOOL_BN_FWD: + data = std::shared_ptr<avx::byte>(bn_fwd_malloc(len), bn_fwd_free); + break; + case MPOOL_BN_BWD: + data = std::shared_ptr<avx::byte>(bn_bwd_malloc(len), bn_bwd_free); + break; + case MPOOL_LRN_FWD: + data = std::shared_ptr<avx::byte>(lrn_fwd_malloc(len), lrn_fwd_free); + break; + case MPOOL_LRN_BWD: + data = std::shared_ptr<avx::byte>(lrn_bwd_malloc(len), lrn_bwd_free); + break; + case MPOOL_CONV_FWD: + data = std::shared_ptr<avx::byte>(conv_fwd_malloc(len), conv_fwd_free); + break; + case MPOOL_CONV_BWD: + data = std::shared_ptr<avx::byte>(conv_bwd_malloc(len), conv_bwd_free); + break; + case MPOOL_POOLING_FWD: + data = std::shared_ptr<avx::byte>(pooling_fwd_malloc(len), pooling_fwd_free); + break; + case MPOOL_POOLING_BWD: + data = std::shared_ptr<avx::byte>(pooling_bwd_malloc(len), pooling_bwd_free); + break; + case MPOOL_IP_FWD: + data = std::shared_ptr<avx::byte>(ip_fwd_malloc(len), ip_fwd_free); + break; + case MPOOL_IP_BWD: + data = std::shared_ptr<avx::byte>(ip_bwd_malloc(len), ip_bwd_free); + break; + case MPOOL_CONCAT_FWD: + data = std::shared_ptr<avx::byte>(concat_fwd_malloc(len), concat_fwd_free); + break; + case MPOOL_CONCAT_BWD: + data = std::shared_ptr<avx::byte>(concat_bwd_malloc(len), concat_bwd_free); + break; + default: + data = std::shared_ptr<avx::byte>(anon_malloc(len), anon_free); + break; + } + + return data; +} + +std::shared_ptr<avx::byte> Allocator::malloc(vector<int> dims, int element_sz, mem_pool_t mpool) +{ + auto len = std::accumulate(dims.begin(), dims.end(), 1 + , std::multiplies<int>()) * element_sz; + + return Allocator::malloc(len, mpool); +} + +void* dnn_malloc(size_t size, mem_pool_t mpool) +{ + return anon_pool.malloc(size); +} + +void dnn_free(void *p, mem_pool_t mpool) +{ + return anon_pool.free(p); +} diff --git a/python/ideep4py/mm/tensor.cc b/python/ideep4py/mm/tensor.cc new file mode 100644 index 00000000..d608f47c --- /dev/null +++ b/python/ideep4py/mm/tensor.cc @@ -0,0 +1,32 @@ +/* + *Copyright (c) 2018 Intel Corporation.
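Allocator::malloc binds each buffer to its pool's free function through the std::shared_ptr deleter slot, so memory flows back to the pool it came from with no explicit bookkeeping at the call sites. A minimal self-contained sketch of the same idiom, with a trivial stand-in for the Memory pool class:

```cpp
#include <cstdlib>
#include <memory>

// Trivial stand-in for the per-primitive Memory pool; the real one recycles
// previously freed blocks instead of hitting the system allocator each time.
struct Pool {
  void *malloc(std::size_t n) { return std::malloc(n); }
  void free(void *p) { std::free(p); }
};

static Pool conv_fwd_pool;

// The deleter lambda routes the buffer back to conv_fwd_pool, mirroring
// std::shared_ptr<avx::byte>(conv_fwd_malloc(len), conv_fwd_free) above.
std::shared_ptr<unsigned char> pool_malloc(std::size_t n) {
  return std::shared_ptr<unsigned char>(
      static_cast<unsigned char *>(conv_fwd_pool.malloc(n)),
      [](unsigned char *p) { conv_fwd_pool.free(p); });
}
```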
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "tensor.h" +#include "blas.h" + +Tensor *Tensor::sum(vector axis) { + return blas_sum(this, axis); +} diff --git a/python/ideep4py/primitives/bn.cc b/python/ideep4py/primitives/bn.cc new file mode 100644 index 00000000..a464fdc1 --- /dev/null +++ b/python/ideep4py/primitives/bn.cc @@ -0,0 +1,222 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include "mkl_vml_functions.h" +#include "layer.h" +#include "tensor.h" +#include "bn.h" +#include "bn_fwd.h" +#include "bn_bwd.h" +#include "prim_factory.h" +#include "reorder_op.h" + +template +void batch_normalization_inv(T *var, float eps, int size, T *inv) { + int blk_nthr = omp_get_max_threads(), + blk_num = blk_nthr, + blk_len = size / blk_num, + blk_len_ex = size % blk_num; + + if (!blk_len) + blk_nthr = size; + + T *var_eps = reinterpret_cast(new avx::byte[size * sizeof(T)]); + + # pragma omp parallel num_threads(blk_nthr) + { + int ithr = omp_get_thread_num(); + int blen = ithr < blk_len_ex ? blk_len + 1 : blk_len; + int bstart = ithr <= blk_len_ex ? 
(blk_len + 1) * ithr : + blk_len_ex * (blk_len + 1) + (ithr - blk_len_ex) * blk_len; + int bend = bstart + blen; + + for (int b = bstart; b < bend; b++) + var_eps[b] = var[b] + eps; + } + + vsPowx(size, var_eps, -0.5, inv); + delete(reinterpret_cast(var_eps)); + return; +} + +template +std::vector batch_normalization::Forward( + Tensor *src, Tensor *w, Tensor *mean, Tensor *var, float eps) { + + assert(memory_data_type() == src.cxx_data_type()); + + bool scale_shift = w ? true : false; + bool global_stats = mean ? true : false; + bool training = mean ? false : true; + + auto bn_fwd = batch_normalization_fwd_factory::get( + (mkldnn::memory::dims)src->dims(), + eps, scale_shift, global_stats, training); + + void *src_data = src->data(); + shared_ptr src_itnl; + if (src->cxx_format() != bn_fwd->get_src_fmt()) { + auto reorder = ReorderFactory::get( + (mkldnn::memory::dims)src->dims(), + (mkldnn::memory::format)src->cxx_format(), + (mkldnn::memory::format)bn_fwd->get_src_fmt()); + src_itnl= Allocator::malloc(src->len(), MPOOL_REORDER); + //src_itnl = new avx::byte[src->len()]; + reorder->execute(src_data, src_itnl.get()); + src_data = src_itnl.get(); + } + +#if 0 + auto dst = new Tensor(src->ndims(), src->dims(), + (mkldnn_memory_format_t)bn_fwd->get_dst_fmt(), + src->type()); + mean = training ? + new Tensor(bn_fwd->get_mean_ndims(), bn_fwd->get_mean_dims(), + (mkldnn_memory_format_t)bn_fwd->get_mean_fmt(), + src->type()) : mean; + var = training ? + new Tensor(bn_fwd->get_var_ndims(), bn_fwd->get_var_dims(), + (mkldnn_memory_format_t)bn_fwd->get_var_fmt(), + src->type()) : var; +#else + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_BN_FWD); + auto dst = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)bn_fwd->get_dst_fmt(), + src->type()); + + Tensor *inv; + if (training) { + auto data_mean = Allocator::malloc(bn_fwd->get_mean_dims(), type2size(src->type()), MPOOL_BN_FWD); + mean = new Tensor(bn_fwd->get_mean_ndims(), bn_fwd->get_mean_dims(), data_mean, + (mkldnn_memory_format_t)bn_fwd->get_mean_fmt(), + src->type()); + auto data_var = Allocator::malloc(bn_fwd->get_var_dims(), type2size(src->type()), MPOOL_BN_FWD); + var = new Tensor(bn_fwd->get_var_ndims(), bn_fwd->get_var_dims(), data_var, + (mkldnn_memory_format_t)bn_fwd->get_var_fmt(), + src->type()); + auto data_inv = Allocator::malloc(bn_fwd->get_var_dims(), type2size(src->type()), MPOOL_BN_FWD); + inv = new Tensor(bn_fwd->get_var_ndims(), bn_fwd->get_var_dims(), data_inv, + (mkldnn_memory_format_t)bn_fwd->get_var_fmt(), + src->type()); + } +#endif + + bn_fwd->execute(src_data, (w ? w->data() : nullptr), + dst->data(), (mean ? mean->data() : nullptr), + (var ? var->data() : nullptr)); + + std::vector outs; + outs.push_back(dst); + if (training) { + outs.push_back(mean); + outs.push_back(var); + + batch_normalization_inv(reinterpret_cast(var->data()), eps, + var->desc().data.dims[0], + reinterpret_cast(inv->data())); + outs.push_back(inv); + } + + return outs; +} + +template +std::vector batch_normalization::Backward( + Tensor *src, Tensor *diff_dst, Tensor *mean, + Tensor *var, Tensor *w, float eps) { + + assert(memory_data_type() == src.cxx_data_type()); + + bool scale_shift = w ? 
true : false; + + auto bn_bwd = batch_normalization_bwd_factory::get( + (mkldnn::memory::dims)src->dims(), + (mkldnn::memory::dims)diff_dst->dims(), + eps, scale_shift); + + void *src_data = src->data(); + shared_ptr src_itnl; + if (src->cxx_format() != bn_bwd->get_src_fmt()) { + auto reorder = ReorderFactory::get( + (mkldnn::memory::dims)src->dims(), + (mkldnn::memory::format)src->cxx_format(), + (mkldnn::memory::format)bn_bwd->get_src_fmt()); + //src_itnl = new avx::byte[src->len()]; + src_itnl= Allocator::malloc(src->len(), MPOOL_REORDER); + reorder->execute(src_data, src_itnl.get()); + src_data = src_itnl.get(); + } + + void *diff_dst_data = diff_dst->data(); + shared_ptr diff_dst_itnl; + if (diff_dst->cxx_format() != bn_bwd->get_diff_dst_fmt()) { + auto reorder = ReorderFactory::get( + (mkldnn::memory::dims)diff_dst->dims(), + (mkldnn::memory::format)diff_dst->cxx_format(), + (mkldnn::memory::format)bn_bwd->get_diff_dst_fmt()); + diff_dst_itnl = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_itnl = new avx::byte[diff_dst->len()]; + reorder->execute(diff_dst_data, diff_dst_itnl.get()); + diff_dst_data = diff_dst_itnl.get(); + } + +#if 0 + auto diff_src = new Tensor(src->ndims(), src->dims(), + (mkldnn_memory_format_t)bn_bwd->get_diff_src_fmt(), + src->type()); + auto diff_w = scale_shift ? + new Tensor(w->ndims(), w->dims(), + (mkldnn_memory_format_t)bn_bwd->get_diff_w_fmt(), + w->type()) : (Tensor *)(nullptr); +#else + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_BN_BWD); + auto diff_src = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)bn_bwd->get_diff_src_fmt(), + src->type()); + Tensor *diff_w = nullptr; + if (scale_shift) { + auto data_diff_w = Allocator::malloc(w->dims(), type2size(src->type()), MPOOL_BN_BWD); + diff_w = new Tensor(w->ndims(), w->dims(), data_diff_w, + (mkldnn_memory_format_t)bn_bwd->get_diff_w_fmt(), + w->type()); + } +#endif + + bn_bwd->execute(src_data, diff_dst_data, mean->data(), var->data(), + (w ? w->data() : nullptr), diff_src->data(), + (diff_w ? diff_w->data() : nullptr)); + + std::vector outs; + outs.push_back(diff_src); + if (scale_shift) + outs.push_back(diff_w); + + return outs; +} + +template class batch_normalization; diff --git a/python/ideep4py/primitives/concat.cc b/python/ideep4py/primitives/concat.cc new file mode 100644 index 00000000..ee6c42dd --- /dev/null +++ b/python/ideep4py/primitives/concat.cc @@ -0,0 +1,229 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
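batch_normalization_inv earlier in bn.cc exists to hand the framework the inverse standard deviation, inv[i] = (var[i] + eps)^(-1/2), which it computes by adding eps across OpenMP chunks and then calling MKL's vsPowx with exponent -0.5. The same math as a dependency-free reference loop:

```cpp
#include <cmath>
#include <cstddef>

// Reference for bn.cc's batch_normalization_inv: inv[i] = 1 / sqrt(var[i] + eps).
void bn_inv_stddev(const float *var, float eps, std::size_t size, float *inv) {
  for (std::size_t i = 0; i < size; ++i)
    inv[i] = 1.0f / std::sqrt(var[i] + eps);
}
```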
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "tensor.h" +#include "mem.h" +#include "concat.h" +#include "utils.h" +#include "concat_fwd.h" +#include "prim_factory.h" +#include "reorder_op.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Concat::Concat() +{ +} + +template +Concat::~Concat() +{ +} + +template +Tensor *Concat::Forward( + std::vector src, + int axis) +{ + // sanity check + assert (src.size() > 0); + + std::vector src_fmts; + std::vector expected_fmts; + std::vector src_datas; + std::vector src_reorder; + + std::vector src_ds; + mkldnn::memory::dims dst_d; + + //get output channel + int out_channel = 0; + for (int i = 0; i < src.size(); i++) { + //get relate infor from src + src_fmts.push_back(src[i]->cxx_format()); + src_datas.push_back(src[i]->data()); + src_reorder.push_back(src[i]->data()); + + src_ds.push_back(src[i]->cxx_dims()); + out_channel += (src[i]->cxx_dims())[axis]; + } + + for (int i = 0; i < src_ds[0].size(); i++){ + if (i == axis) + dst_d.push_back(out_channel); + else + dst_d.push_back(src_ds[0][i]); + } + + //LOG(INFO) << "dst_d={" << dst_d[0] << "," << dst_d[1] << "," << dst_d[2] << "," << dst_d[3] << "}"; + + // get a concat fwd from primitive pool + ConcatFwd *concat_forward = NULL; + concat_forward = ConcatFwdFactory::get(src_ds, dst_d, axis); + + // check wehther fmt is same + expected_fmts = concat_forward->src_fmts_; + assert(src_fmts.size() == expected_fmts.size()); + + for (int i = 0; i < expected_fmts.size(); i++) { + if ( src_fmts[i] != expected_fmts[i]) { + //LOG(INFO) << "Concat src fmt not match ("<< i << "):" + //"src_fmt=" << src_fmts[i] << + //"; expected_fmt="<< expected_fmts[i]; + // From reorder factory to find one reorder + ReorderOp* reorder_src_op = ReorderFactory::get(src_ds[i], src_fmts[i], expected_fmts[i]); + src_reorder[i] = new avx::byte[src[i]->len()]; + reorder_src_op->execute(src_datas[i], src_reorder[i]); + } + } + + // create tensor based on primitive's dst + // assume dst and src have same data type + // Tensor *dst_tensor = new Tensor(dst_d, src[0]->cxx_data_type(), concat_forward->dst_fmt_, cpu_engine); + auto data = Allocator::malloc(dst_d, type2size(src[0]->type()), MPOOL_CONCAT_FWD); + Tensor *dst_tensor = new Tensor(dst_d.size(), dst_d, data, + (mkldnn_memory_format_t)concat_forward->dst_fmt_, + src[0]->type()); + + // do forward + concat_forward->execute(src_reorder, dst_tensor->data()); + + //FIXME here may cause performance issue + for (int i = 0; i < src_reorder.size(); i++) { + if (src_reorder[i] != src_datas[i]) { + // means reorder happen + delete static_cast(src_reorder[i]); + } + } + + return dst_tensor; +} + + +template +std::vector Concat::Backward( + Tensor *diff_dst, + std::vector offsets, + int axis) +{ + // + assert (offsets.size() > 0); + + std::vector gxs; + std::vector gxs_data; + + mkldnn::memory::format expected_dst_fmt; // expected format + void *diff_dst_data = NULL; + void *diff_dst_reorder = NULL; + + // get diff src fmts + // offset store the offsets of concat + // Example + // inputs: [2, 2, 3, 3], [2, 3, 3, 3], [2, 1, 3, 3], [2, 1, 3, 3] + // outputs: [2, 7, 3, 3] + // offsets: [2, 5, 6] + std::vector diff_src_d; + mkldnn::memory::dims diff_dst_d = 
diff_dst->cxx_dims(); + + // get elements + mkldnn::memory::dims tmp; + for (int i = 0; i < offsets.size(); i++) { + int axis_value = -1; + if (i == 0) + axis_value = offsets[0]; + else + axis_value = offsets[i] - offsets[i-1]; + + for (int j = 0; j < diff_dst_d.size(); j++) { + if (j == axis) + tmp.push_back(axis_value); + else + tmp.push_back(diff_dst_d[j]); + + } + diff_src_d.push_back(tmp); + tmp.clear(); + } + + // get last element + for (int i = 0; i < diff_dst_d.size(); i++){ + if (i == axis) + tmp.push_back(diff_dst_d[axis]-offsets.back()); + else + tmp.push_back(diff_dst_d[i]); + } + diff_src_d.push_back(tmp); + tmp.clear(); + + // get a concat bwd from primitive pool + ConcatBwd *concat_backward = NULL; + concat_backward = ConcatBwdFactory::get(diff_src_d, diff_dst_d, axis); + + //check whether diff dst fmt is same + expected_dst_fmt = concat_backward->diff_dst_fmt_; + diff_dst_data = diff_dst->data(); + if (expected_dst_fmt != diff_dst->cxx_format()) { + //LOG(INFO) << "Concat diff dst fmt not match: diff_dst_fmt=" + // << diff_dst->cxx_format() << "; expected fmt = " << expected_dst_fmt; + + // From reorder factory to find one reorder + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst->cxx_dims(), diff_dst->cxx_format(), expected_dst_fmt); + diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_data, diff_dst_reorder); + diff_dst_data = diff_dst_reorder; + } + + // create diff src tensors to execute concat backward + assert(diff_src_d.szie() == concat_backward->diff_src_fmts_.size()); + for (int i = 0; i < diff_src_d.size(); i++) { + // Tensor *diff_src_tensor = new Tensor(diff_src_d[i], diff_dst->cxx_data_type(), concat_backward->diff_src_fmts_[i], cpu_engine); + auto data = Allocator::malloc(diff_src_d[i], type2size(diff_dst->type()), MPOOL_CONCAT_BWD); + Tensor *diff_src_tensor = new Tensor(diff_src_d[i].size(), diff_src_d[i], data, + (mkldnn_memory_format_t)concat_backward->diff_src_fmts_[i], + diff_dst->type()); + gxs.push_back(diff_src_tensor); + gxs_data.push_back(diff_src_tensor->data()); + } + + // do concat backward + concat_backward->execute(gxs_data, diff_dst_data); + + // + if (diff_dst_reorder != NULL) + delete static_cast(diff_dst_reorder); + + return gxs; +} + +template class Concat; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/conv.cc b/python/ideep4py/primitives/conv.cc new file mode 100644 index 00000000..5cc808b6 --- /dev/null +++ b/python/ideep4py/primitives/conv.cc @@ -0,0 +1,373 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
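Concat::Backward above recovers each input's extent on the concat axis from the cumulative offsets: with channel sizes 2, 3, 1, 1 the offsets arrive as [2, 5, 6], so the sizes come back as the first offset, the successive differences, and the total minus the last offset. A minimal sketch of that recovery:

```cpp
#include <vector>

// Recover per-input sizes along the concat axis from cumulative offsets.
// `total` is the extent of the concatenated output on that axis.
std::vector<int> sizes_from_offsets(const std::vector<int> &offsets, int total) {
  std::vector<int> sizes;
  int prev = 0;
  for (int off : offsets) {        // e.g. {2, 5, 6} with total 7
    sizes.push_back(off - prev);   // -> 2, 3, 1
    prev = off;
  }
  sizes.push_back(total - prev);   // -> 1 (the last input)
  return sizes;
}
```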
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "tensor.h" +#include "mem.h" +#include "conv.h" +#include "utils.h" +#include "conv_fwd.h" +#include "conv_bwd_data.h" +#include "conv_bwd_weights.h" +#include "prim_factory.h" +#include "reorder_op.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Convolution2D::Convolution2D() +{ +} + +template +Convolution2D::~Convolution2D() +{ +} + +template +Tensor *Convolution2D::Forward( + Tensor *src, Tensor *weights, + Tensor *bias, + conv_param_t *cp) +{ + // sanity check + mkldnn::memory::dims src_dims = (mkldnn::memory::dims)src->dims(); + mkldnn::memory::dims w_dims = (mkldnn::memory::dims)weights->dims(); + mkldnn::memory::dims dst_dims = (mkldnn::memory::dims)cp->out_dims; + mkldnn::memory::dims b_dims; + if (bias) + b_dims = (mkldnn::memory::dims)bias->dims(); + + //sanity check for data type + //assuem all x/w/b should have same data type as T + //FIXME + //yli135: Is it possible x and w have different data type???? + assert(memory_data_type() == src->cxx_data_type()); + assert(memory_data_type() == weights->cxx_data_type()); + if (bias) + assert(memory_data_type() == bias->cxx_data_type()); + + // get a conv2d fwd from primitive pool + Convolution2DFwd *conv2d_forward = NULL; + if (bias) + conv2d_forward = Convolution2DFwdFactory::get(src_dims, w_dims, b_dims, dst_dims, + cp->dilate_y, cp->dilate_x, cp->sy, cp->sx, cp->pad_lh, cp->pad_lw, cp->pad_rh, cp->pad_rw); + else + conv2d_forward = Convolution2DFwdFactory::get(src_dims, w_dims, NONE_DIMS, dst_dims, + cp->dilate_y, cp->dilate_x, cp->sy, cp->sx, cp->pad_lh, cp->pad_lw, cp->pad_rh, cp->pad_rw); + + // FIXME: in this model, every call to conv_forward will create a new tensor, when to free??? 
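Every primitive in this patch repeats the layout dance that Forward performs next: ask the cached primitive which memory format it expects and, only when the tensor's current layout differs, reorder into scratch storage first. A condensed sketch of that pattern; the Fmt/Buf types and the reorder callback are stand-ins, not this patch's mkldnn classes:

```cpp
#include <cstddef>
#include <memory>

// Stand-in types: the patch uses mkldnn memory formats and ReorderOp instead.
enum class Fmt { nchw, nChw16c };
struct Buf { Fmt fmt; const void *data; std::size_t len; };

// Return the pointer the primitive should consume; reorder only on mismatch.
// `scratch` must stay alive until the primitive has executed.
const void *maybe_reorder(const Buf &src, Fmt want,
                          std::shared_ptr<unsigned char> &scratch,
                          void (*reorder)(const void *, void *, std::size_t)) {
  if (src.fmt == want)
    return src.data;  // layouts already match: zero copy
  scratch.reset(new unsigned char[src.len],
                std::default_delete<unsigned char[]>());
  reorder(src.data, scratch.get(), src.len);
  return scratch.get();
}
```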
+ mkldnn::memory::format src_fmt = src->cxx_format(); // src fmt in tensor + mkldnn::memory::format w_fmt = weights->cxx_format(); // weight fmt in tensor + + void *src_tmp = src->data(); + void *w_tmp = weights->data(); + shared_ptr src_reorder; + shared_ptr w_reorder; + + // check wehther fmt is same + if (src_fmt == conv2d_forward->src_fmt_ && w_fmt == conv2d_forward->weights_fmt_) { + //LOG(INFO) << "primitive fmt matched"; + } else { + //LOG(INFO) << "fmt not match, need to reorder"; + + if (src_fmt != conv2d_forward->src_fmt_) { + //LOG(INFO) << "src_fmt=" << src_fmt <<", conv2d_forward->src_fmt_=" << conv2d_forward->src_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_src_op = ReorderFactory::get(src_dims, src_fmt, conv2d_forward->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + + if (w_fmt != conv2d_forward->weights_fmt_) { + //LOG(INFO) << "weight_fmt=" << w_fmt <<", conv2d_forward->weight_fmt_=" << conv2d_forward->weights_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_w_op = ReorderFactory::get(w_dims, w_fmt, conv2d_forward->weights_fmt_); + w_reorder = Allocator::malloc(weights->len(), MPOOL_REORDER); + //w_reorder = new avx::byte[weights->len()]; + reorder_w_op->execute(w_tmp, w_reorder.get()); + w_tmp = w_reorder.get(); + + + // set internal fmt back to weight tensor + weights->reset_memory( + static_cast(conv2d_forward->weights_fmt_), + w_reorder); + } + } + + // create tensor based on primitive's dst + // assume dst and src have same data type + //Tensor *dst_tensor = new Tensor(dst_dims, src->cxx_data_type(), conv2d_forward->dst_fmt_, cpu_engine); + auto data = Allocator::malloc(dst_dims, type2size(src->type()), MPOOL_CONV_FWD); + Tensor *dst_tensor = new Tensor(dst_dims.size(), dst_dims, data, + (mkldnn_memory_format_t)conv2d_forward->dst_fmt_, + src->type()); + + // do forward + if (bias) { + conv2d_forward->execute(src_tmp, w_tmp, bias->data(), dst_tensor->data()); + } else { + conv2d_forward->execute(src_tmp, w_tmp, dst_tensor->data()); + } + + return dst_tensor; +} + +/* + * gW = gy *x + */ +template +Tensor *Convolution2D::BackwardWeights( + Tensor *src, Tensor *diff_dst, + conv_param_t *cp) +{ + std::vector bwd_weight_vec; + + // sanity check + mkldnn::memory::dims src_dims = (mkldnn::memory::dims)src->dims(); + mkldnn::memory::dims diff_dst_dims = (mkldnn::memory::dims)diff_dst->dims(); + mkldnn::memory::dims diff_w_dims = (mkldnn::memory::dims)cp->out_dims; + + assert(src_dims == src->cxx_dims() && diff_dst_dims = diff_dst->cxx_dims()); + + // sanity check for data type + // FIXME + // is it possible y and w have different data type?? 
+ assert(memory_data_type() == src->cxx_data_type()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + + // get a conv2d bwd weights from primitive pool + Convolution2DBwdWeights *conv2d_bwd_weights = NULL; + conv2d_bwd_weights = Convolution2DBwdWeightsFactory::get(src_dims, diff_w_dims, NONE_DIMS, diff_dst_dims, + cp->dilate_y, cp->dilate_x, cp->sy, cp->sx, cp->pad_lh, cp->pad_lw, cp->pad_rh, cp->pad_rw); + + // create tensor based on selected primitive + mkldnn::memory::format src_fmt = src->cxx_format(); + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + + //assum dst and src have same data type + void* src_tmp = src->data(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr src_reorder; + shared_ptr diff_dst_reorder; + + //check whether fmt is same + if (src_fmt == conv2d_bwd_weights->src_fmt_ && diff_dst_fmt == conv2d_bwd_weights->diff_dst_fmt_) { + // LOG(INFO) << "primitive fmt matched"; + } else { + // LOG(INFO) << "fmt not match, need to reorder"; + + if (src_fmt != conv2d_bwd_weights->src_fmt_) { + //LOG(INFO) << "src_fmt=" << src_fmt << ", conv2d_bwd_weights->src_fmt_=" << conv2d_bwd_weights->src_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_src_op = ReorderFactory::get(src_dims, src_fmt, conv2d_bwd_weights->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + if (diff_dst_fmt != conv2d_bwd_weights->diff_dst_fmt_) { + // LOG(INFO) << "diff_dst_fmt=" << diff_dst_fmt <<", conv2d_bwd_weights->diff_dst_fmt_=" << conv2d_bwd_weights->diff_dst_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst_dims, diff_dst_fmt, conv2d_bwd_weights->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + } + + //assum dst and src have same data type + //Tensor *diff_w_tensor = new Tensor(diff_w_dims, src->cxx_data_type(), conv2d_bwd_weights->diff_weights_fmt_, cpu_engine); + auto w_data = Allocator::malloc(diff_w_dims, type2size(src->type()), MPOOL_CONV_BWD); + Tensor *diff_w_tensor = new Tensor(diff_w_dims.size(), diff_w_dims, w_data, + (mkldnn_memory_format_t)conv2d_bwd_weights->diff_weights_fmt_, + src->type()); + + // do execute + conv2d_bwd_weights->execute(src_tmp, diff_w_tensor->data(), diff_dst_tmp); + return diff_w_tensor; +} + +template +std::vector Convolution2D::BackwardWeightsBias( + Tensor *src, Tensor *diff_dst, + conv_param_t *cp) +{ + std::vector bwd_weight_vec; + + // sanity check + mkldnn::memory::dims src_dims = (mkldnn::memory::dims)src->dims(); + mkldnn::memory::dims diff_dst_dims = (mkldnn::memory::dims)diff_dst->dims(); + mkldnn::memory::dims diff_w_dims = (mkldnn::memory::dims)cp->out_dims; + mkldnn::memory::dims diff_b_dims = {diff_w_dims[0]}; + + assert(src_dims == src->cxx_dims() && diff_dst_dims = diff_dst->cxx_dims()); + + // sanity check for data type + // FIXME + // is it possible y and w have different data type?? 
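BackwardWeightsBias, continuing below, sizes the bias gradient as diff_b_dims = {diff_w_dims[0]}, one value per output channel, because the bias gradient is simply diff_dst summed over the batch and spatial positions. A naive reference for that reduction, assuming NCHW layout:

```cpp
#include <vector>

// Reference bias gradient for NCHW diff_dst: gb[c] = sum over n, h, w of gy.
std::vector<float> conv_bias_grad(const float *gy, int N, int C, int H, int W) {
  std::vector<float> gb(C, 0.0f);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int hw = 0; hw < H * W; ++hw)
        gb[c] += gy[(n * C + c) * H * W + hw];
  return gb;
}
```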
+ assert(memory_data_type() == src->cxx_data_type()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + + // get a conv2d bwd weights from primitive pool + Convolution2DBwdWeights *conv2d_bwd_weights = NULL; + conv2d_bwd_weights = Convolution2DBwdWeightsFactory::get(src_dims, diff_w_dims, diff_b_dims, diff_dst_dims, + cp->dilate_y, cp->dilate_x, cp->sy, cp->sx, cp->pad_lh, cp->pad_lw, cp->pad_rh, cp->pad_rw); + + // create tensor based on selected primitive + mkldnn::memory::format src_fmt = src->cxx_format(); + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + + //assum dst and src have same data type + void* src_tmp = src->data(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr src_reorder; + shared_ptr diff_dst_reorder; + + //check whether fmt is same + if (src_fmt == conv2d_bwd_weights->src_fmt_ && diff_dst_fmt == conv2d_bwd_weights->diff_dst_fmt_) { + // LOG(INFO) << "primitive fmt matched"; + } else { + // LOG(INFO) << "fmt not match, need to reorder"; + + if (src_fmt != conv2d_bwd_weights->src_fmt_) { + //LOG(INFO) << "src_fmt=" << src_fmt << ", conv2d_bwd_weights->src_fmt_=" << conv2d_bwd_weights->src_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_src_op = ReorderFactory::get(src_dims, src_fmt, conv2d_bwd_weights->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + if (diff_dst_fmt != conv2d_bwd_weights->diff_dst_fmt_) { + // LOG(INFO) << "diff_dst_fmt=" << diff_dst_fmt <<", conv2d_bwd_weights->diff_dst_fmt_=" << conv2d_bwd_weights->diff_dst_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst_dims, diff_dst_fmt, conv2d_bwd_weights->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + } + + //assum dst and src have same data type + //Tensor *diff_w_tensor = new Tensor(diff_w_dims, src->cxx_data_type(), conv2d_bwd_weights->diff_weights_fmt_, cpu_engine); + auto w_data = Allocator::malloc(diff_w_dims, type2size(src->type()), MPOOL_CONV_BWD); + Tensor *diff_w_tensor = new Tensor(diff_w_dims.size(), diff_w_dims, w_data, + (mkldnn_memory_format_t)conv2d_bwd_weights->diff_weights_fmt_, + src->type()); + + auto b_data = Allocator::malloc(diff_b_dims, type2size(src->type()), MPOOL_CONV_BWD); + Tensor *diff_b_tensor = new Tensor(diff_b_dims.size(), diff_b_dims, b_data, + (mkldnn_memory_format_t)mkldnn::memory::format::x, src->type()); + + conv2d_bwd_weights->execute(src_tmp, diff_w_tensor->data(), diff_b_tensor->data(), diff_dst_tmp); + bwd_weight_vec.push_back(diff_w_tensor); + bwd_weight_vec.push_back(diff_b_tensor); + + return bwd_weight_vec; +} + +template +Tensor *Convolution2D::BackwardData( + Tensor *weights, Tensor *diff_dst, + conv_param_t *cp) +{ + //sanity check + mkldnn::memory::dims diff_src_dims = (mkldnn::memory::dims)cp->out_dims; + mkldnn::memory::dims w_dims = (mkldnn::memory::dims)weights->dims(); + mkldnn::memory::dims diff_dst_dims = (mkldnn::memory::dims)diff_dst->dims(); + assert(w_dims == weights->cxx_dims() && diff_dst_dims == diff_dst->cxx_dims()); + + // sanity check for data type + // assuem all x/w/b should have same data type as T + // FIXME + // yli135: Is it possible x and w have 
different data type???? + assert(memory_data_type() == weights->cxx_data_type()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + + // get a conv2d bwd data from primitive pool + Convolution2DBwdData *conv2d_bwd_data = NULL; + conv2d_bwd_data = Convolution2DBwdDataFactory::get( diff_src_dims, w_dims, diff_dst_dims, + cp->dilate_y, cp->dilate_x, cp->sy, cp->sx, cp->pad_lh, cp->pad_lw, cp->pad_rh, cp->pad_rw); + + // FIXME: in this model, every call to conv_forward will create a new tensor, when to free??? + mkldnn::memory::format w_fmt = weights->cxx_format(); + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + + void* w_tmp = weights->data(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr w_reorder; + shared_ptr diff_dst_reorder; + + if (w_fmt == conv2d_bwd_data->weights_fmt_ && diff_dst_fmt == conv2d_bwd_data->diff_dst_fmt_) { + //LOG(INFO) << "conv2d bwd data primitive fmt matched"; + } else { + //LOG(INFO) << "conv2d bwd data fmt not match, need to reorder"; + + if (w_fmt != conv2d_bwd_data->weights_fmt_) { + //LOG(INFO) << "weight_fmt=" << w_fmt << ", conv2d_bwd_data->weights_fmt_="<< conv2d_bwd_data->weights_fmt_; + ReorderOp* reorder_w_op = ReorderFactory::get(w_dims, w_fmt, conv2d_bwd_data->weights_fmt_); + w_reorder = Allocator::malloc(weights->len(), MPOOL_REORDER); + //w_reorder = new avx::byte[weights->len()]; + reorder_w_op->execute(w_tmp, w_reorder.get()); + w_tmp = w_reorder.get(); + } + if (diff_dst_fmt != conv2d_bwd_data->diff_dst_fmt_) { + //LOG(INFO) << "diff_dst_fmt=" << diff_dst_fmt <<", conv2d_bwd_data->diff_dst_fmt_=" << conv2d_bwd_data->diff_dst_fmt_; + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst_dims, diff_dst_fmt, conv2d_bwd_data->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + } + + // create tensor based on selected primitive + // assume dst and src have same data type + //Tensor *diff_src_tensor = new Tensor(diff_src_dims, diff_dst->cxx_data_type(), conv2d_bwd_data->diff_src_fmt_, cpu_engine); + auto data = Allocator::malloc(diff_src_dims, type2size(diff_dst->type()), MPOOL_CONV_BWD); + Tensor *diff_src_tensor = new Tensor(diff_src_dims.size(), diff_src_dims, data, + (mkldnn_memory_format_t)conv2d_bwd_data->diff_src_fmt_, + diff_dst->type()); + + conv2d_bwd_data->execute(diff_src_tensor->data(), w_tmp, diff_dst_tmp); + + return diff_src_tensor; +} + + +template class Convolution2D; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/dropout.cc b/python/ideep4py/primitives/dropout.cc new file mode 100644 index 00000000..3a975b19 --- /dev/null +++ b/python/ideep4py/primitives/dropout.cc @@ -0,0 +1,138 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include "cpu_info.h" +#include "dropout.h" +#include "layer.h" +#include "mkl_vsl.h" +#include "prim_factory.h" +#include "reorder_op.h" +#include "tensor.h" + +static void bernoulli_generate(const long n, const double p, int* r) { + std::srand(std::time(0)); + const int seed = 17 + std::rand() % 4096; + +#ifdef _OPENMP + int nthr = omp_get_max_threads(); + const int threshold = nthr * OpenMpManager::getProcessorSpeedMHz() / 3; + const bool run_parallel = (omp_in_parallel() == 0) && (n >= threshold); + if (!run_parallel) { + nthr = 1; + } + +# pragma omp parallel num_threads(nthr) + { + const int ithr = omp_get_thread_num(); + const long avg_amount = (n + nthr - 1) / nthr; + const long my_offset = ithr * avg_amount; + const long my_amount = std::min(my_offset + avg_amount, n) - my_offset; +#else + { + const long my_amount = n; + const long my_offset = 0; +#endif + if (my_amount > 0) { + VSLStreamStatePtr stream; + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, my_offset); + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, my_amount, r + my_offset, p); + vslDeleteStream(&stream); + } + } +} + +template +std::vector Dropout::Forward(Tensor* x, float ratio) { + const auto scale = 1.0 / (1.0 - ratio); + const auto x_buf = static_cast(x->data()); + const auto size = x->size(); + const auto mask = new Tensor(x->ndims(), x->dims(), x->format(), x->type()); + const auto y = new Tensor(x->ndims(), x->dims(), x->format(), x->type()); + + // Init the mask + std::unique_ptr bernouli_nums(new int[size]); + bernoulli_generate(size, 1.0 - ratio, bernouli_nums.get()); + + const auto mask_buf = static_cast(mask->data()); + const auto y_buf = static_cast(y->data()); + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < size; ++i) { + mask_buf[i] = bernouli_nums[i] * scale; + y_buf[i] = mask_buf[i] * x_buf[i]; + } + + return std::vector{mask, y}; +} + +template +Tensor* Dropout::Backward(Tensor* mask, Tensor* gy) { + assert(mask->size() == gy->size()); + + // Reorder mask if needed + auto gy_fmt = gy->cxx_format(); + auto mask_fmt = mask->cxx_format(); + void* mask_data = mask->data(); + shared_ptr mask_reorder; + + if (gy_fmt == mask_fmt) { + //LOG(INFO) << "mask fmt matched"; + } else { + // LOG(INFO) << "mask fmt not match, need to reorder"; + // LOG(INFO) << "mask_fmt=" << mask_fmt <<", gy_fmt=" << gy_fmt; + auto reorder_op = ReorderFactory::get(mask->dims(), mask_fmt, gy_fmt); + mask_reorder = Allocator::malloc(mask->len(), MPOOL_REORDER); + //mask_reorder = new avx::byte[mask->len()]; + reorder_op->execute(mask->data(), mask_reorder.get()); + mask_data = mask_reorder.get(); + } + + const auto size = mask->size(); + const auto gx = new Tensor(gy->ndims(), gy->dims(), gy->format(), gy->type()); + + //const auto mask_buf = static_cast(mask_reorder ? 
mask_reorder : mask->data());
+    const auto mask_buf = static_cast<T*>(mask_data);
+    const auto gy_buf = static_cast<T*>(gy->data());
+    const auto gx_buf = static_cast<T*>(gx->data());
+
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < size; ++i) {
+        gx_buf[i] = mask_buf[i] * gy_buf[i];
+    }
+
+    return gx;
+}
+
+template class Dropout<float>;
diff --git a/python/ideep4py/primitives/eltwise.cc b/python/ideep4py/primitives/eltwise.cc
new file mode 100644
index 00000000..939fe8b9
--- /dev/null
+++ b/python/ideep4py/primitives/eltwise.cc
@@ -0,0 +1,116 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include
+#include
+#include
+#include "layer.h"
+#include "tensor.h"
+#include "eltwise.h"
+#include "eltwise_fwd.h"
+#include "eltwise_bwd.h"
+#include "prim_factory.h"
+#include "reorder_op.h"
+
+using namespace mkldnn;
+
+const mkldnn::memory::dims NONE_DIMS = {};
+extern engine cpu_engine;
+
+template<typename T, typename T2>
+Eltwise<T, T2>::Eltwise()
+{
+}
+
+template<typename T, typename T2>
+Eltwise<T, T2>::~Eltwise()
+{
+}
+
+template<typename T, typename T2>
+Tensor *Eltwise<T, T2>::Forward(Tensor *src, eltwise_algorithm_t alg_kind, T2 alpha, T2 beta)
+{
+    // sanity check for data type
+    assert(memory_data_type<T>() == src->cxx_data_type());
+
+    // get an eltwise fwd from primitive pool
+    EltwiseFwd<T, T2> *eltwise_fwd = nullptr;
+    // FIXME: in this model, every call to eltwise_fwd will create a new tensor, when to free???
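+    // Illustrative usage sketch (hypothetical names; the exact
+    // eltwise_algorithm_t enumerator for ReLU depends on eltwise.h):
+    //
+    //   Tensor *y = Eltwise<float, float>::Forward(x, ELTWISE_RELU,
+    //                                              /*alpha=*/0.0, /*beta=*/0.0);
+    //
+    // alpha/beta parameterize the activation (e.g. the negative slope of
+    // leaky ReLU); plain ReLU ignores both.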
+ mkldnn::memory::format src_fmt = src->cxx_format(); // src fmt in tensor + mkldnn::algorithm malg_kind = ideepy2mkldnn_eltwise_algorithm(alg_kind); + eltwise_fwd = EltwiseFwdFactory::get(src->dims(), malg_kind, src_fmt, alpha, beta); + + // create tensor based on primitive's dst + // assume dst and src have same data type + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_ELTWISE_FWD); + Tensor *dst_tensor = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)eltwise_fwd->dst_fmt_, + src->type()); + + // do forward + eltwise_fwd->execute(src->data(), dst_tensor->data()); + + return dst_tensor; +} + +template +Tensor *Eltwise::Backward(Tensor *src, Tensor *diff_dst, eltwise_algorithm_t alg_kind, T2 alpha, T2 beta) +{ + // sanity check for data type + assert(memory_data_type() == diff_dst->cxx_data_type()); + assert(src->ndims() == diff_dst->ndims()); + assert(src->size() == diff_dst->size()); + + // get a eltwise bwd data from primitive pool + EltwiseBwd *eltwise_bwd = nullptr; + mkldnn::algorithm malg_kind = ideepy2mkldnn_eltwise_algorithm(alg_kind); + eltwise_bwd = EltwiseBwdFactory::get(diff_dst->dims(), malg_kind, diff_dst->cxx_format(), alpha, beta); + + void *src_buf = src->data(); + + if (src->cxx_format() != diff_dst->cxx_format()) { + //LOG(INFO) << "eltwise bwd data fmt not match, need to reorder"; + //LOG(INFO) << "diff_dst_fmt=" << diff_dst->cxx_format() <<", src format=" << src->cxx_format(); + ReorderOp* reorder_src_op = ReorderFactory::get(src->dims(), src->cxx_format(), diff_dst->cxx_format()); + //src_reorder = new avx::byte[diff_dst->len()]; + auto src_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + reorder_src_op->execute(src_buf, src_reorder.get()); + src_buf = static_cast(src_reorder.get()); + } + + // create tensor based on selected primitive + // assume dst and src have same data type + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_ELTWISE_BWD); + Tensor *diff_src = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)eltwise_bwd->src_diff_fmt_, + src->type()); + + eltwise_bwd->execute(src_buf, diff_dst->data(), diff_src->data()); + + return diff_src; +} + +template class Eltwise; + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/linear.cc b/python/ideep4py/primitives/linear.cc new file mode 100644 index 00000000..1b7f6f94 --- /dev/null +++ b/python/ideep4py/primitives/linear.cc @@ -0,0 +1,305 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "linear.h" +#include "utils.h" +#include "linear_fwd.h" +#include "linear_bwd_data.h" +#include "linear_bwd_weights.h" +#include "linear_fwd_factory.h" +#include "linear_bwd_data_factory.h" +#include "linear_bwd_weights_factory.h" +#include "reorder_op.h" +#include "reorder_factory.h" +using namespace mkldnn; + +extern const mkldnn::memory::dims NONE_DIMS; +extern engine cpu_engine; + +template +Linear::Linear() +{ +} + +template +Linear::~Linear() +{ +} + + + +template +Tensor *Linear::Forward( + Tensor *src, Tensor *weights, + Tensor *bias) +{ + //sanity check + mkldnn::memory::dims src_dims = src->cxx_dims(); + mkldnn::memory::dims w_dims = weights->cxx_dims(); + mkldnn::memory::dims b_dims; + mkldnn::memory::dims dst_dims ; + if (bias) { + b_dims = bias->cxx_dims(); + assert(b_dims == bias->cxx_dims()); + } + + if (src->ndims() != weights->ndims()) { + assert(weights->ndims() == 2 && src->ndims() == 4); + w_dims = {w_dims[0], src_dims[1], src_dims[2], src_dims[3]}; + weights->reset_memory(format_2_as_4(weights->format()), w_dims); + } + dst_dims = {src_dims[0], w_dims[0]}; + + //sanity check for data type + //FIXME + //is it possible y and w have different data type? + assert(memory_data_type() == src->cxx_data_type()); + assert(memory_data_type() == weights->cxx_data_type()); + if (bias) { + assert(memory_data_type() == bias->cxx_data_type()); + } + //get a linear from primitive pool + LinearFwd *linear_forward = NULL; + if (bias) + linear_forward = LinearFwdFactory::get(src_dims, w_dims, b_dims, dst_dims); + else + linear_forward = LinearFwdFactory::get(src_dims, w_dims, NONE_DIMS, dst_dims); + //FIXME: in this model, every call to conv_forward will create a new mdarray, when to free? 
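+    // The block below follows the same convention as the convolution
+    // primitives: tensors keep their incoming format, and a reordered copy
+    // is materialized only when the format chosen by the MKL-DNN primitive
+    // differs from the one the tensor actually carries.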
+    mkldnn::memory::format src_fmt = src->cxx_format();
+    mkldnn::memory::format w_fmt = weights->cxx_format();
+    void *src_tmp = src->data();
+    void *w_tmp = weights->data();
+    shared_ptr<avx::byte> src_reorder;
+    shared_ptr<avx::byte> w_reorder;
+    // check whether the formats match
+    if (src_fmt == linear_forward->src_fmt_ && w_fmt == linear_forward->weights_fmt_) {
+        //LOG(INFO) << "primitive fmt matched";
+    } else {
+        //LOG(INFO) << "format not matched, need to do reorder";
+        if (src_fmt != linear_forward->src_fmt_) {
+            //LOG(INFO) << "src_fmt" << src_fmt << ", linear_forward->src_fmt_" << linear_forward->src_fmt_;
+            ReorderOp<T>* reorder_src_op = ReorderFactory<T>::get(src_dims, src_fmt, linear_forward->src_fmt_);
+            src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER);
+            //src_reorder = new avx::byte[src->len()];
+            reorder_src_op->execute(src_tmp, src_reorder.get());
+            src_tmp = src_reorder.get();
+        }
+        if (w_fmt != linear_forward->weights_fmt_) {
+            //LOG(INFO) << "weight_fmt = " << w_fmt << ", linear_forward->weights_fmt_=" << linear_forward->weights_fmt_;
+            //FIXME: when to free the reordered memory
+            ReorderOp<T>* reorder_w_op = ReorderFactory<T>::get(w_dims, w_fmt, linear_forward->weights_fmt_);
+            w_reorder = Allocator::malloc(weights->len(), MPOOL_REORDER);
+            //w_reorder = new avx::byte[weights->len()];
+            reorder_w_op->execute(w_tmp, w_reorder.get());
+            w_tmp = w_reorder.get();
+            // set the internal fmt back on the weight tensor
+            weights->reset_memory(
+                    static_cast<mkldnn_memory_format_t>(linear_forward->weights_fmt_),
+                    w_reorder);
+        }
+    }
+    // create mdarray based on primitive's dst
+    //Tensor *dst_tensor = new Tensor(dst_dims, src->cxx_data_type(), linear_forward->dst_fmt_, cpu_engine);
+    auto data = Allocator::malloc(dst_dims, type2size(src->type()), MPOOL_IP_FWD);
+    Tensor *dst_tensor = new Tensor(dst_dims.size(), dst_dims, data,
+            (mkldnn_memory_format_t)linear_forward->dst_fmt_,
+            src->type());
+    // do forward
+    if (bias) {
+        linear_forward->execute(src_tmp, w_tmp, bias->data(), dst_tensor->data());
+    } else {
+        linear_forward->execute(src_tmp, w_tmp, dst_tensor->data());
+    }
+
+    return dst_tensor;
+}
+
+/*
+ * gW = gy * x
+ */
+template<typename T>
+std::vector<Tensor *> Linear<T>::BackwardWeights(
+        Tensor *src, Tensor* diff_dst, bool need_bias)
+{
+    std::vector<Tensor *> bwd_weight_vec;
+    mkldnn::memory::dims src_dims = src->cxx_dims();
+    mkldnn::memory::dims diff_dst_dims = diff_dst->cxx_dims();
+    mkldnn::memory::dims diff_w_dims;
+    mkldnn::memory::dims diff_b_dims;
+    diff_w_dims = {diff_dst_dims[1], src_dims[1]};
+    /*
+    if (src->ndims() == 4) {
+        diff_w_dims = {diff_dst_dims[1], src_dims[1], src_dims[2], src_dims[3]};
+    } else if (src->ndims() == 2) {
+        diff_w_dims = {diff_dst_dims[1], src_dims[1]};
+    } else {
+        LOG(INFO) << "Error:: src only support 2 dims or 4 dims";
+    }*/
+    if (need_bias)
+        diff_b_dims = {diff_w_dims[0]};
+    // sanity check for data type
+    // FIXME
+    // is it possible y and w have different data type?
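+    // Illustrative usage sketch (hypothetical 2-D tensors `x` and `gy`):
+    //
+    //   auto grads = Linear<float>::BackwardWeights(x, gy, /*need_bias=*/true);
+    //   Tensor *gW = grads[0];   // shape {out_features, in_features}
+    //   Tensor *gb = grads[1];   // shape {out_features}, format x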
+ assert(memory_data_type() == src->cxx_data_type()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + //get a linear bwd weights from primitive pool + LinearBwdWeights *linear_bwd_weights = NULL; + if (need_bias) { + linear_bwd_weights = LinearBwdWeightsFactory::get(src_dims, diff_w_dims, diff_b_dims, diff_dst_dims); + } else { + linear_bwd_weights = LinearBwdWeightsFactory::get(src_dims, diff_w_dims, NONE_DIMS, diff_dst_dims); + } + //create tensor based on selected primitive + mkldnn::memory::format src_fmt = src->cxx_format(); + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + //assum dst and src have same data type + void* src_tmp = src->data(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr src_reorder; + shared_ptr diff_dst_reorder; + //check whether fmt is same + if (src_fmt == linear_bwd_weights->src_fmt_ && diff_dst_fmt == linear_bwd_weights->diff_dst_fmt_) { + //LOG(INFO) << "primitive fmt matched"; + } else { + //LOG(INFO) << "fmt not match, need to reorder"; + if (src_fmt != linear_bwd_weights->src_fmt_) { + // LOG(INFO) << "src_fmt = " << src_fmt << ", linear_bwd_weights->src_fmt_=" << linear_bwd_weights->src_fmt_; + ReorderOp* reorder_src_op = ReorderFactory::get(src_dims, src_fmt, linear_bwd_weights->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + if (diff_dst_fmt != linear_bwd_weights->diff_dst_fmt_) { + //LOG(INFO) << "diff_dst_fmt = " << diff_dst_fmt << ", linear_bwd_weights->diff_dst_fmt = " << linear_bwd_weights->diff_dst_fmt_; + //FIXME when to free the reordered memory + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst_dims, diff_dst_fmt, linear_bwd_weights->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + } + //assume dst and src have the same data type + //Tensor *diff_w_tensor = new Tensor(diff_w_dims, src->cxx_data_type(), linear_bwd_weights->diff_weights_fmt_, cpu_engine); + auto w_data = Allocator::malloc(diff_w_dims, type2size(src->type()), MPOOL_IP_BWD); + Tensor *diff_w_tensor = new Tensor(diff_w_dims.size(), diff_w_dims, w_data, + (mkldnn_memory_format_t)linear_bwd_weights->diff_weights_fmt_, + src->type()); + //do execute + if (need_bias) { + //assume bias's format is always mkldnn::memory::format::x + //Tensor *diff_b_tensor = new Tensor(diff_b_dims, src->cxx_data_type(), mkldnn::memory::format::x, cpu_engine); + auto b_data = Allocator::malloc(diff_b_dims, type2size(src->type()), MPOOL_IP_BWD); + Tensor *diff_b_tensor = new Tensor(diff_b_dims.size(), diff_b_dims, b_data, + (mkldnn_memory_format_t)mkldnn::memory::format::x, src->type()); + linear_bwd_weights->execute(src_tmp, diff_w_tensor->data(), diff_b_tensor->data(), diff_dst_tmp); + bwd_weight_vec.push_back(diff_w_tensor); + bwd_weight_vec.push_back(diff_b_tensor); + } else { + linear_bwd_weights->execute(src_tmp, diff_w_tensor->data(), diff_dst_tmp); + bwd_weight_vec.push_back(diff_w_tensor); + } + + return bwd_weight_vec; +} + +template +Tensor *Linear::BackwardData( + Tensor *weights, Tensor *diff_dst) +{ + //sanity check + mkldnn::memory::dims w_dims = weights->cxx_dims(); + mkldnn::memory::dims diff_dst_dims = diff_dst->cxx_dims(); + mkldnn::memory::dims diff_src_dims; + 
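+    // gx = gy * W: diff_dst is {N, out_features} and the weights are
+    // {out_features, in_features}, so diff_src comes out {N, in_features},
+    // which is exactly what the assignment below encodes.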
diff_src_dims = {diff_dst_dims[0], w_dims[1]}; + /* + if (lp->src_ndims == 2) { + assert(weights->ndims() == 2); + diff_src_dims = {lp->src_d1, lp->src_d2}; + } else if (lp->src_ndims == 4) { + diff_src_dims = {lp->src_d1, lp->src_d2, lp->src_d3, lp->src_d4}; + if (weights->ndims() != 4) { + w_dims = {w_dims[0], diff_src_dims[1], diff_src_dims[2], diff_src_dims[3]}; + weights->reset_memory(format_2_as_4(weights->format()), w_dims); + } + } else { + LOG(INFO) << "Error:: src ndim not support(2 or 4 only)"; + }*/ + //sanity check for data type + //assume all a/w/b should have the same type as T + //FIXME + //is it possible x and w have different data type??? + assert(memory_data_type() == weights->cxx_data_type()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + //get a linear bwd data from primitive pool + LinearBwdData *linear_bwd_data = NULL; + linear_bwd_data = LinearBwdDataFactory::get(diff_src_dims, w_dims, diff_dst_dims); + //FIXME: in this model, every call to linear_forward will create a new tensor, when to free?? + mkldnn::memory::format w_fmt = weights->cxx_format(); + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + + void* w_tmp = weights->data(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr w_reorder; + shared_ptr diff_dst_reorder; + + if (w_fmt == linear_bwd_data->weights_fmt_ && diff_dst_fmt == linear_bwd_data->diff_dst_fmt_) { + //LOG(INFO) << "linear bwd data primitive fmt matched"; + } else { + //LOG(INFO) << "linear bwd data fmt not match, need to reorder"; + if (w_fmt != linear_bwd_data->weights_fmt_) { + // LOG(INFO) << "weights_fmt_ = " << w_fmt << ", linear_bwd_data->weights_fmt_ = " << linear_bwd_data->weights_fmt_; + ReorderOp* reorder_w_op = ReorderFactory::get(w_dims, w_fmt, linear_bwd_data->weights_fmt_); + w_reorder = Allocator::malloc(weights->len(), MPOOL_REORDER); + //w_reorder = new avx::byte[weights->len()]; + reorder_w_op->execute(w_tmp, w_reorder.get()); + w_tmp = w_reorder.get(); + } + if (diff_dst_fmt != linear_bwd_data->diff_dst_fmt_) { + //LOG(INFO) << "diff_dst_fmt = " << diff_dst_fmt << ", linear_bwd_data->diff_dst_fmt = " << linear_bwd_data->diff_dst_fmt_; + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst_dims, diff_dst_fmt, linear_bwd_data->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + } + //create tensor based on selected primitive + //assume dst and src have the same data type + //Tensor* diff_src_tensor = new Tensor(diff_src_dims, diff_dst->cxx_data_type(), linear_bwd_data->diff_src_fmt_, cpu_engine); + auto data = Allocator::malloc(diff_src_dims, type2size(diff_dst->type()), MPOOL_IP_BWD); + Tensor *diff_src_tensor = new Tensor(diff_src_dims.size(), diff_src_dims, data, + (mkldnn_memory_format_t)linear_bwd_data->diff_src_fmt_, + diff_dst->type()); + linear_bwd_data->execute(diff_src_tensor->data(), w_tmp, diff_dst_tmp); + + return diff_src_tensor; +} +template class Linear; + diff --git a/python/ideep4py/primitives/lrn.cc b/python/ideep4py/primitives/lrn.cc new file mode 100755 index 00000000..a35738e4 --- /dev/null +++ b/python/ideep4py/primitives/lrn.cc @@ -0,0 +1,190 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "tensor.h" +#include "mem.h" +#include "lrn.h" +#include "utils.h" +#include "lrn_fwd.h" +#include "lrn_bwd.h" +#include "prim_factory.h" +#include "reorder_op.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +LocalResponseNormalization::LocalResponseNormalization() +{ +} + +template +LocalResponseNormalization::~LocalResponseNormalization() +{ +} + +template +std::vector LocalResponseNormalization::Forward( + Tensor *src, lrn_param_t* pp) +{ + //sanity check for data type + assert(memory_data_type() == src.cxx_data_type()); + + // get a conv2d fwd from primitive pool + mkldnn::memory::format src_fmt = src->cxx_format(); // src fmt in tensor + LocalResponseNormalizationFwd *lrn_forward = NULL; + lrn_forward = LocalResponseNormalizationFwdFactory::get( + src->dims(), src_fmt, + pp->n, pp->k, + pp->alpha, pp->beta, + lrn_algo_convert(pp->algo_kind)); + + // mkldnn::memory::format src_fmt = src->cxx_format(); // src fmt in tensor + + void *src_tmp = src->data(); + shared_ptr src_reorder; + + // check wehther fmt is same + if (src_fmt == lrn_forward->src_fmt_) { + //LOG(INFO) << "lrn forward fmt matched"; + } else { + //LOG(INFO) << "lrn fwd fmt not match, need to reorder"; + // LOG(INFO) << "src_fmt=" << src_fmt <<", lrn_forward->src_fmt_=" << lrn_forward->src_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_src_op = ReorderFactory::get(src->dims(), src_fmt, lrn_forward->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + + // create tensor based on primitive's dst + // assume dst and src have same data type + //Tensor *dst_tensor = new Tensor(src->dims(), src->cxx_data_type(), lrn_forward->dst_fmt_, cpu_engine); + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_LRN_FWD); + Tensor *dst_tensor = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)lrn_forward->dst_fmt_, + src->type()); + + // do forward + // to return workspace + // LOG(INFO) << "ws_dt_=" << lrn_forward->ws_dt_; + // workspace must be int tensor + //Tensor *ws_tensor = new Tensor((lrn_forward->ws_dims_), lrn_forward->ws_dt_, lrn_forward->ws_fmt_, cpu_engine); + auto ws_data = Allocator::malloc(lrn_forward->ws_size_, MPOOL_LRN_FWD); + Tensor 
*ws_tensor = new Tensor(lrn_forward->ws_dims_, + static_cast(lrn_forward->ws_dt_), + lrn_forward->ws_fmt_, ws_data); + + lrn_forward->execute(src_tmp, dst_tensor->data(), ws_tensor->data()); + std::vector outputs; + outputs.push_back(dst_tensor); + outputs.push_back(ws_tensor); + + //LOG(INFO) << "Succ exec lrn forward"; + return outputs; +} + +template +Tensor *LocalResponseNormalization::Backward( + Tensor *src, Tensor *diff_dst, Tensor *ws, lrn_param_t* pp) +{ + //sanity check + assert(src->ndims() == diff_dst->ndims()); + assert(src->size() == diff_dst->size()); + assert(memory_data_type() == diff_dst->cxx_data_type()); + + mkldnn::memory::dims ws_dims; + mkldnn::memory::data_type ws_dt; + ws_dims = ws->cxx_dims(); + ws_dt = ws->cxx_data_type(); + + // get a conv2d bwd data from primitive pool + LocalResponseNormalizationBwd *lrn_bwd = NULL; + lrn_bwd = LocalResponseNormalizationBwdFactory::get(src->dims(), diff_dst->dims(), ws_dims, ws_dt, + pp->n, pp->k, pp->alpha, pp->beta, lrn_algo_convert(pp->algo_kind)); + + // FIXME: in this model, every call to conv_forward will create a new tensor, when to free??? + shared_ptr ws_reorder; + mkldnn::memory::format ws_fmt = ws->cxx_format(); + void* ws_tmp = ws->data(); + assert(ws_tmp == NULL); + + mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format(); + void* diff_dst_tmp = diff_dst->data(); + shared_ptr diff_dst_reorder; + + if (ws_fmt != lrn_bwd->ws_fmt_) { + //LOG(INFO) << "lrn bwd data ws fmt not match, need to reorder"; + //LOG(INFO) << "ws_fmt=" << ws_fmt << ", lrn_bwd->ws_fmt_="<< lrn_bwd->ws_fmt_; + ReorderOp* reorder_ws_op = ReorderFactory::get(ws_dims, ws_fmt, lrn_bwd->ws_fmt_); + ws_reorder = Allocator::malloc(ws->len(), MPOOL_REORDER); + //ws_reorder = new avx::byte[ws->len()]; + reorder_ws_op->execute(ws_tmp, ws_reorder.get()); + ws_tmp = ws_reorder.get(); + } + if (diff_dst_fmt != lrn_bwd->diff_dst_fmt_) { + //LOG(INFO) << "lrn bwd data diff dst fmt not match, need to reorder"; + //LOG(INFO) << "diff_dst_fmt=" << diff_dst_fmt <<", lrn_bwd->diff_dst_fmt_=" << lrn_bwd->diff_dst_fmt_; + ReorderOp* reorder_diff_dst_op = ReorderFactory::get(diff_dst->dims(), diff_dst_fmt, lrn_bwd->diff_dst_fmt_); + diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER); + //diff_dst_reorder = new avx::byte[diff_dst->len()]; + reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get()); + diff_dst_tmp = diff_dst_reorder.get(); + } + void *src_buf = src->data(); + shared_ptr src_reorder; + if (src->cxx_format() != diff_dst->cxx_format()) { + //LOG(INFO) << "lrn bwd data src fmt not match, need to reorder"; + // LOG(INFO) << "diff_dst_fmt=" << diff_dst->cxx_format() <<", src format=" << src->cxx_format(); + ReorderOp* reorder_src_op = ReorderFactory::get(src->dims(), src->cxx_format(), diff_dst->cxx_format()); + //src_reorder = new avx::byte[diff_dst->len()]; + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + reorder_src_op->execute(src_buf, src_reorder.get()); + src_buf = src_reorder.get(); + } + + // create tensor based on selected primitive + // assume dst and src have same data type + //Tensor *diff_src_tensor = new Tensor(src->dims(), diff_dst->cxx_data_type(), lrn_bwd->diff_src_fmt_, cpu_engine); + auto data = Allocator::malloc(src->dims(), type2size(src->type()), MPOOL_LRN_BWD); + Tensor *diff_src_tensor = new Tensor(src->ndims(), src->dims(), data, + (mkldnn_memory_format_t)lrn_bwd->diff_src_fmt_, + src->type()); + + lrn_bwd->execute(src_buf, diff_src_tensor->data(), diff_dst_tmp, ws_tmp); + + return 
diff_src_tensor; +} + + +template class LocalResponseNormalization; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/bn_bwd.cc b/python/ideep4py/primitives/ops/bn_bwd.cc new file mode 100644 index 00000000..70dc8fb0 --- /dev/null +++ b/python/ideep4py/primitives/ops/bn_bwd.cc @@ -0,0 +1,118 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include "mkldnn.hpp" +#include "bn_bwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +void batch_normalization_bwd::setup(mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + float eps, bool scale_shift) { + flags_ |= scale_shift ? use_scale_shift : 0; + + // memory desc + auto src_md = memory::desc({src_d}, memory_data_type(), + get_desired_format(bn_size_)); + auto diff_dst_md = memory::desc({diff_dst_d}, memory_data_type(), + get_desired_format(bn_size_)); + + // fwd desc & primitive desc + auto fwd_desc = batch_normalization_forward::desc(prop_kind::forward_training, src_md, eps, flags_); + auto fwd_pd = batch_normalization_forward::primitive_desc(fwd_desc, cpu_engine); + + // bwd desc & primitive desc + auto bwd_desc = batch_normalization_backward::desc( + scale_shift ? 
prop_kind::backward : prop_kind::backward_data, + diff_dst_md, src_md, eps, flags_); + auto bwd_pd = batch_normalization_backward::primitive_desc( + bwd_desc, cpu_engine, fwd_pd); + + // memory primitive + src_mem_.reset(new memory({src_md, cpu_engine}, dummy)); + diff_dst_mem_.reset(new memory({diff_dst_md, cpu_engine}, dummy)); + mean_mem_.reset(new memory(bwd_pd.mean_primitive_desc(), dummy)); + var_mem_.reset(new memory(bwd_pd.variance_primitive_desc(), dummy)); + diff_src_mem_.reset(new memory({src_md, cpu_engine}, dummy)); + + // bn bwd primitive + if ((flags_ & use_scale_shift) && mkldnn_use_scaleshift) { + w_mem_.reset(new memory(bwd_pd.weights_primitive_desc(), dummy)); + diff_w_mem_.reset(new memory(bwd_pd.diff_weights_primitive_desc(), dummy)); + + bn_bwd_.reset(new batch_normalization_backward(bwd_pd, *src_mem_, *mean_mem_, + *var_mem_, *diff_dst_mem_, *w_mem_, *diff_src_mem_, *diff_w_mem_)); + } else { + bn_bwd_.reset(new batch_normalization_backward(bwd_pd, *src_mem_, *mean_mem_, + *var_mem_, *diff_dst_mem_, *diff_src_mem_)); + } + + bwd_primitives_.push_back(*bn_bwd_); + + return; +} + +template +void batch_normalization_bwd::execute(void *src, void *diff_dst, + void *mean, void *var, + void *w, void *diff_src, + void *diff_w) { + // couple with buffer + src_mem_->set_data_handle(src); + diff_dst_mem_->set_data_handle(diff_dst); + mean_mem_->set_data_handle(mean); + var_mem_->set_data_handle(var); + + if (flags_ & use_scale_shift) { + w_mem_->set_data_handle(w); + diff_w_mem_->set_data_handle(diff_w); + } + + diff_src_mem_->set_data_handle(diff_src); + + // exec + bwd_stream_->submit(bwd_primitives_); + + // decouple + src_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + mean_mem_->set_data_handle(dummy); + var_mem_->set_data_handle(dummy); + + if (flags_ & use_scale_shift) { + w_mem_->set_data_handle(dummy); + diff_w_mem_->set_data_handle(dummy); + } + + diff_src_mem_->set_data_handle(dummy); + + return; +} + +template class batch_normalization_bwd; diff --git a/python/ideep4py/primitives/ops/bn_fwd.cc b/python/ideep4py/primitives/ops/bn_fwd.cc new file mode 100644 index 00000000..b94fafcb --- /dev/null +++ b/python/ideep4py/primitives/ops/bn_fwd.cc @@ -0,0 +1,134 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include "mkldnn.hpp" +#include "bn_fwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +void batch_normalization_fwd::setup(mkldnn::memory::dims src_d, + float eps, bool scale_shift, + bool global_stats, bool training) { + + flags_ |= scale_shift ? use_scale_shift : 0; + flags_ |= global_stats ? use_global_stats : 0; + + pkind_ = training ? + prop_kind::forward_training : + prop_kind::forward_scoring; + + // memory desc + auto src_md = memory::desc({src_d}, memory_data_type(), get_desired_format(src_d[1])); + + // fwd desc & primitive desc + auto fwd_desc = batch_normalization_forward::desc(pkind_, src_md, eps, flags_); + auto fwd_pd = batch_normalization_forward::primitive_desc(fwd_desc, cpu_engine); + + // memory primitive + src_mem_.reset(new memory({src_md, cpu_engine}, dummy)); + dst_mem_.reset(new memory(fwd_pd.dst_primitive_desc(), dummy)); + + if (flags_ & use_scale_shift) + w_mem_.reset(new memory(fwd_pd.weights_primitive_desc(), dummy)); + + if (training || (flags_ & use_global_stats)) { + mean_mem_.reset(new memory(fwd_pd.mean_primitive_desc(), dummy)); + var_mem_.reset(new memory(fwd_pd.variance_primitive_desc(), dummy)); + } + + // bn fwd primitive + if (!training && !(flags_ & use_global_stats)) { + if ((flags_ & use_scale_shift) && mkldnn_use_scaleshift) { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, *w_mem_, *dst_mem_)); + } else { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, *dst_mem_)); + } + } else if (flags_ & use_global_stats) { + if ((flags_ & use_scale_shift) && mkldnn_use_scaleshift) { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, (const primitive::at)*mean_mem_, + (const primitive::at)*var_mem_, *w_mem_, *dst_mem_)); + } else { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, (const primitive::at)*mean_mem_, + (const primitive::at)*var_mem_, *dst_mem_)); + } + } else { + if ((flags_ & use_scale_shift) && mkldnn_use_scaleshift) { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, *w_mem_, *dst_mem_, *mean_mem_, *var_mem_)); + } else { + bn_fwd_.reset(new batch_normalization_forward( + fwd_pd, *src_mem_, *dst_mem_, *mean_mem_, *var_mem_)); + } + } + + fwd_primitives_.push_back(*bn_fwd_); + + return; +} + +template +void batch_normalization_fwd::execute(void *src, void *w, void *dst, + void *mean, void *var) { + // couple with buffer + src_mem_->set_data_handle(src); + dst_mem_->set_data_handle(dst); + + if (flags_ & use_scale_shift) + w_mem_->set_data_handle(w); + + if ((pkind_ == prop_kind::forward_training) || + (flags_ & use_global_stats)) { + mean_mem_->set_data_handle(mean); + var_mem_->set_data_handle(var); + } + + // exec + fwd_stream_->submit(fwd_primitives_); + + // decouple + src_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + + if (flags_ & use_scale_shift) + w_mem_->set_data_handle(dummy); + + if ((pkind_ == prop_kind::forward_training) || + (flags_ & use_global_stats)) { + mean_mem_->set_data_handle(dummy); + var_mem_->set_data_handle(dummy); + } + + return; +} + +template class batch_normalization_fwd; diff --git a/python/ideep4py/primitives/ops/concat_bwd.cc b/python/ideep4py/primitives/ops/concat_bwd.cc new file mode 100644 index 00000000..09e31bb1 --- /dev/null +++ b/python/ideep4py/primitives/ops/concat_bwd.cc @@ -0,0 +1,137 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "concat_bwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +ConcatBwd::ConcatBwd( std::vector diff_src_ds, + mkldnn::memory::dims diff_dst_d, + int axis) +{ + bwd_stream_.reset(new stream(stream::kind::eager)); + // create concat primitive + setup(diff_src_ds, diff_dst_d, axis); +} + +template +ConcatBwd::~ConcatBwd() +{ +} + +template +void ConcatBwd::setup( std::vector diff_src_ds, + mkldnn::memory::dims diff_dst_d, + int axis) +{ + //LOG(INFO) << "Concat backward_setup"; + + assert(diff_src_ds.size() > 0); + axis_ = axis; + + /* init the offset */ + memory::dims offsets = {0, 0, 0, 0}; + + //LOG(INFO) << "diff dst dims: [" << diff_dst_d[0] << "," << diff_dst_d[1] + // << "," << diff_dst_d[2] << "," << diff_dst_d[3] << "]"; + + //FIXME + // Currently, concat backward's diff_dst fmt is hard set, and store it + memory::format diff_dst_fmt = get_desired_format(diff_dst_d[1]); // + diff_dst_fmt_ = diff_dst_fmt; + + // create diff dst md/mpt/mem + diff_dst_mpd_.reset(new memory::primitive_desc( + {{diff_dst_d}, memory_data_type(), diff_dst_fmt}, cpu_engine)); + diff_dst_mem_.reset(new memory( + {{{diff_dst_d}, memory_data_type(), diff_dst_fmt}, cpu_engine}, dummy)); + + for (int i = 0; i < diff_src_ds.size(); i++) { + //FIXME + //Currently, concat's diff src fmt hard set as diff_dst fmt, need to pay attention in future for performance issue + memory::dims diff_src_tz = diff_src_ds[i]; + //LOG(INFO) << "diff src dims: [" << diff_src_tz[0] << "," << diff_src_tz[1] + // << "," << diff_src_tz[2] << "," << diff_src_tz[3] << "]"; + + auto diff_src_mpd = memory::primitive_desc( + {{diff_src_tz}, memory_data_type(), diff_dst_fmt}, cpu_engine); + auto diff_src_mem = memory({diff_src_mpd}, dummy); + + // store diff src fmt, same as diff dst + diff_src_fmts_.push_back(diff_dst_fmt); + + diff_src_mems_.push_back(diff_src_mem); + + // create view from gy to gxs[i] + std::shared_ptr view_pd; + view_pd.reset(new view::primitive_desc(*diff_dst_mpd_, diff_src_tz, offsets)); + // create reorder primitive from gy to gxs[i] + std::shared_ptr reorder_pd; + reorder_pd.reset(new reorder::primitive_desc(view_pd.get()->dst_primitive_desc(), diff_src_mpd)); + + std::shared_ptr reorder_prim; + reorder_prim.reset(new reorder(*reorder_pd, *diff_dst_mem_, diff_src_mems_[i])); + + bwd_primitives_.push_back(*reorder_prim); + + 
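+        // Each iteration reorders one slice of diff_dst into diff_src[i]
+        // through a view; advancing `offsets` below moves the view window
+        // along the concat axis so the next slice starts where this one ended.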
offsets[axis_] += diff_src_tz[axis_]; + } + + return; +} + +template +void ConcatBwd::execute(std::vector diff_src, void *diff_dst) +{ + //LOG(INFO) << "Concat backward"; + assert(diff_src.size() == diff_src_mems_.size()); + + for (int i = 0; i < diff_src_mems_.size(); i++) { + diff_src_mems_[i].set_data_handle(diff_src[i]); + } + diff_dst_mem_->set_data_handle(diff_dst); + + bwd_stream_->submit(bwd_primitives_); + + //after exec, set data handle back + for (int i = 0; i < diff_src_mems_.size(); i++) { + diff_src_mems_[i].set_data_handle(dummy); + } + diff_dst_mem_->set_data_handle(dummy); + + return; +} + +template class ConcatBwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/concat_fwd.cc b/python/ideep4py/primitives/ops/concat_fwd.cc new file mode 100644 index 00000000..472703fa --- /dev/null +++ b/python/ideep4py/primitives/ops/concat_fwd.cc @@ -0,0 +1,131 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "concat_fwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +ConcatFwd::ConcatFwd( std::vector src_ds, + mkldnn::memory::dims dst_d, int axis) +{ + fwd_stream_.reset(new stream(stream::kind::eager)); + // create concat primitive + if (concat_fwd_ == NULL) { + setup(src_ds, dst_d, axis); + } +} + +template +ConcatFwd::~ConcatFwd() +{ +} + +template +void ConcatFwd::setup( std::vector src_ds, + mkldnn::memory::dims dst_d, + int axis) +{ + //LOG(INFO) << "Concat forward_setup"; + + assert(src_ds.size() > 0); + axis_ = axis; + + //LOG(INFO) << "dst dims: [" << dst_d[0] << "," << dst_d[1] + //<< "," << dst_d[2] << "," << dst_d[3] << "]"; + + //FIXME + // Currently, concat's src fms is hard set + memory::format src_fmt = get_desired_format(src_ds[0][1]); // + + for (int i = 0; i < src_ds.size(); i++) { + //FIXME + //Currently, concat's src fmt hard set as nchw, need to pay attention in future for performance issue + memory::dims src_tz = src_ds[i]; + + auto src_mpd = memory::primitive_desc( + {{src_tz}, memory_data_type(), src_fmt}, cpu_engine); + auto src_mem = memory({src_mpd}, dummy); + + src_mpds_.push_back(src_mpd); + src_mems_.push_back(src_mem); + + // concat only accept mkldnn::primitive::at parameter + src_prim_at_.push_back(primitive::at(src_mem)); + + + // store src fmt + src_fmts_.push_back(src_fmt); + } + + // FIXME + // here, if set format as any, will create memory fail????? + dst_md_.reset(new memory::desc(dst_d, memory_data_type(), src_fmt)); + dst_mem_.reset(new memory({{{dst_d}, memory_data_type(), src_fmt}, cpu_engine}, dummy)); + //dst_md_.reset(new memory::desc(dst_d, memory_data_type(), mkldnn::memory::format::any)); + //dst_mem_.reset(new memory({{{dst_d}, memory_data_type(), mkldnn::memory::format::any}, cpu_engine}, dummy)); + + // create concat pd/primitive + concat_pd_.reset(new concat::primitive_desc(*dst_md_, axis_, src_mpds_)); + concat_fwd_.reset(new concat(*concat_pd_, src_prim_at_, *dst_mem_)); + + // store dst fmr + dst_fmt_ = static_cast(concat_pd_.get()->dst_primitive_desc().desc().data.format); + + return; +} + +template +void ConcatFwd::execute(std::vector src, void *dst) +{ + //LOG(INFO) << "Concat forward"; + assert(src.size() == src_mems_.size()); + + for (int i = 0; i < src_mems_.size(); i++) { + src_mems_[i].set_data_handle(src[i]); + } + dst_mem_->set_data_handle(dst); + + fwd_stream_->submit({*concat_fwd_}); + + //after exec, set data handle back + for (int i = 0; i < src_mems_.size(); i++) { + src_mems_[i].set_data_handle(dummy); + } + dst_mem_->set_data_handle(dummy); + + return; +} + +template class ConcatFwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/conv_bwd_data.cc b/python/ideep4py/primitives/ops/conv_bwd_data.cc new file mode 100644 index 00000000..d6d79d81 --- /dev/null +++ b/python/ideep4py/primitives/ops/conv_bwd_data.cc @@ -0,0 +1,153 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "conv_bwd_data.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Convolution2DBwdData::Convolution2DBwdData( + mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw) +{ + bwd_data_stream_.reset(new stream(stream::kind::eager)); + // create conv primitive + if (conv_bwd_data_ == NULL) { + setup(diff_src_d, w_d, diff_dst_d, + dilate_y, dilate_x, + sy, sx, + pad_lh, pad_lw, + pad_rh, pad_rw); + } +} + +template +Convolution2DBwdData::~Convolution2DBwdData() +{ +} + +template +void Convolution2DBwdData::setup( + mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, + int pad_rh, int pad_rw) +{ + //LOG(INFO) << "Convolution backward data setup"; + assert(diff_src_d != NULL); + assert(w_d != NULL); + assert(diff_dst_d != NULL); + + dilates_ = {dilate_y, dilate_x}; + strides_ = {sy, sx}; + padding_l_ = {pad_lh, pad_lw}; + padding_r_ = {pad_rh, pad_rw}; + + //LOG(INFO) << "diff_src[0]=" << diff_src_d[0] << ", diff_src[1]=" << diff_src_d[1] << ", diff_src[2]=" << diff_src_d[2] << ", diff_src[3]=" << diff_src_d[3]; + //LOG(INFO) << "w[0]=" << w_d[0] << ", w[1]=" << w_d[1] << ", w=" << w_d[2] << ", w[3]=" << w_d[3]; + //LOG(INFO) << "diff_dst[0]=" << diff_dst_d[0] << ", diff_dst[1]=" << diff_dst_d[1] << ", diff_dst[2]=" << diff_dst_d[2] << ", diff_dst[3]=" << diff_dst_d[3]; + + //LOG(INFO) << "sy=" << sy << ", sx=" << sx; + // LOG(INFO) << "pl1=" << pad_lh << ", pl2=" << pad_lw << ", pr1=" << pad_rh << ", pr2=" << pad_rw; + + /* create memory descriptors for convolution data w/ no specified format */ + diff_src_md_.reset(new memory::desc({diff_src_d}, memory_data_type(), + memory::format::any)); + weights_md_.reset(new memory::desc({w_d}, + memory_data_type(), memory::format::any)); + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), + memory::format::any)); + /* create a convolution */ + bwd_data_desc_.reset(new convolution_backward_data::desc( + convolution_direct, *diff_src_md_, *weights_md_, + *diff_dst_md_, strides_, dilates_, padding_l_, padding_r_, padding_kind::zero)); + + // FIXME + // yli135: Current conv bwd need a fwd pd as hint, will 
remove in future + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *diff_src_md_, *weights_md_, + *diff_dst_md_, strides_, dilates_, padding_l_, padding_r_, padding_kind::zero)); + fwd_pd_.reset(new convolution_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + /* create backward conv prim desc*/ + bwd_data_pd_.reset(new convolution_backward_data::primitive_desc( + *bwd_data_desc_, cpu_engine, *fwd_pd_)); + + + //store the expected memory format + diff_src_fmt_ = static_cast(bwd_data_pd_.get()->diff_src_primitive_desc().desc().data.format); + weights_fmt_ = static_cast(bwd_data_pd_.get()->weights_primitive_desc().desc().data.format); + diff_dst_fmt_ = static_cast(bwd_data_pd_.get()->diff_dst_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + diff_src_mem_.reset(new memory(bwd_data_pd_.get()->diff_src_primitive_desc(), dummy)); + weights_mem_.reset(new memory(bwd_data_pd_.get()->weights_primitive_desc(), dummy)); + diff_dst_mem_.reset(new memory(bwd_data_pd_.get()->diff_dst_primitive_desc(), dummy)); + + /* create convolution primitive and add it to net */ + conv_bwd_data_.reset(new convolution_backward_data(*bwd_data_pd_, *diff_dst_mem_, + *weights_mem_, *diff_src_mem_)); + + bwd_data_primitives_.push_back(*conv_bwd_data_); + return; +} + +template +void Convolution2DBwdData::execute(void* diff_src, void* w, void* diff_dst) +{ +// LOG(INFO) << "Convolution forward without bias"; +// LOG(INFO) << conv_fwd_; + + diff_src_mem_->set_data_handle(diff_src); + weights_mem_->set_data_handle(w); + diff_dst_mem_->set_data_handle(diff_dst); + //conv_fwd_->execute(); + bwd_data_stream_->submit(bwd_data_primitives_); + + //set back data handke + diff_src_mem_->set_data_handle(dummy); + weights_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + + return; +} + +template class Convolution2DBwdData; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/conv_bwd_weights.cc b/python/ideep4py/primitives/ops/conv_bwd_weights.cc new file mode 100644 index 00000000..69c00d8d --- /dev/null +++ b/python/ideep4py/primitives/ops/conv_bwd_weights.cc @@ -0,0 +1,176 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "conv_bwd_weights.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Convolution2DBwdWeights::Convolution2DBwdWeights( + mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw) +{ + bwd_weights_stream_.reset(new stream(stream::kind::eager)); + // create conv primitive + if (conv_bwd_weights_ == NULL) { + setup(src_d, diff_w_d, diff_b_d, diff_dst_d, + dilate_y, dilate_x, + sy, sx, + pad_lh, pad_lw, + pad_rh, pad_rw); + } +} + +template +Convolution2DBwdWeights::~Convolution2DBwdWeights() +{ +} + +template +void Convolution2DBwdWeights::setup(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, + int pad_rh, int pad_rw) +{ + //LOG(INFO) << "Convolution backward_setup"; + assert(src_d != NULL); + assert(diff_w_d != NULL); + assert(diff_b_d != NULL); // no bias case, expect as NONE_DIMS, not NULL + assert(diff_dst_d != NULL); + + dilates_ = {dilate_y, dilate_x}; + strides_ = {sy, sx}; + padding_l_ = {pad_lh, pad_lw}; + padding_r_ = {pad_rh, pad_rw}; + + /* create memory descriptors for convolution data w/ no specified format */ + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + memory::format::any)); + diff_weights_md_.reset(new memory::desc({diff_w_d}, + memory_data_type(), memory::format::any)); + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), + memory::format::any)); + if (!diff_b_d.empty()) + diff_bias_md_.reset(new memory::desc({diff_b_d}, memory_data_type(), + memory::format::any)); + /* create a convolution */ + if (!diff_b_d.empty()) { + bwd_weights_desc_.reset(new convolution_backward_weights::desc( + convolution_direct, *src_md_, *diff_weights_md_, + *diff_bias_md_, *diff_dst_md_, strides_, dilates_, padding_l_, padding_r_, padding_kind::zero)); + } else { + bwd_weights_desc_.reset(new convolution_backward_weights::desc( + convolution_direct, *src_md_, *diff_weights_md_, + *diff_dst_md_, strides_, dilates_, padding_l_, padding_r_, padding_kind::zero)); + + } + + // FIXME + // yli135: Current conv bwd need a fwd pd as hint, will remove in future + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *src_md_, *diff_weights_md_, + *diff_dst_md_, strides_, dilates_, padding_l_, padding_r_, padding_kind::zero)); + fwd_pd_.reset(new convolution_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + /* create backward conv prim desc*/ + bwd_weights_pd_.reset(new convolution_backward_weights::primitive_desc( + *bwd_weights_desc_, cpu_engine, *fwd_pd_)); + + + //store the expected memory format + src_fmt_ = static_cast(bwd_weights_pd_.get()->src_primitive_desc().desc().data.format); + diff_weights_fmt_ = static_cast(bwd_weights_pd_.get()->diff_weights_primitive_desc().desc().data.format); + diff_dst_fmt_ = static_cast(bwd_weights_pd_.get()->diff_dst_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + src_mem_.reset(new memory(bwd_weights_pd_.get()->src_primitive_desc(), dummy)); + diff_weights_mem_.reset(new memory(bwd_weights_pd_.get()->diff_weights_primitive_desc(), dummy)); + diff_dst_mem_.reset(new 
memory(bwd_weights_pd_.get()->diff_dst_primitive_desc(), dummy)); + + /* create convolution primitive and add it to net */ + if (!diff_b_d.empty()) { + diff_bias_mem_.reset(new memory({{{diff_b_d}, memory_data_type(), memory::format::x}, cpu_engine}, dummy)); + conv_bwd_weights_.reset(new convolution_backward_weights(*bwd_weights_pd_, *src_mem_, + *diff_dst_mem_, *diff_weights_mem_, *diff_bias_mem_)); + } else { + conv_bwd_weights_.reset(new convolution_backward_weights(*bwd_weights_pd_, *src_mem_, + *diff_dst_mem_, *diff_weights_mem_)); + } + + bwd_weights_primitives_.push_back(*conv_bwd_weights_); + return; +} + +template +void Convolution2DBwdWeights::execute(void* src, void* diff_w, void* diff_b, void* diff_dst) +{ +// LOG(INFO) << "Convolution forward"; + //LOG(INFO) << "conv_fwd_:" << conv_fwd_; + //LOG(INFO) << "x=" << x << "; x_size=" << x_d1*x_d2*x_d3*x_d4*4; + src_mem_->set_data_handle(src); + diff_weights_mem_->set_data_handle(diff_w); + diff_bias_mem_->set_data_handle(diff_b); + diff_dst_mem_->set_data_handle(diff_dst); + //conv_fwd_->execute(); + bwd_weights_stream_->submit(bwd_weights_primitives_); + src_mem_->set_data_handle(dummy); + diff_weights_mem_->set_data_handle(dummy); + diff_bias_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + return; +} + +template +void Convolution2DBwdWeights::execute(void* src, void* diff_w, void* diff_dst) +{ +// LOG(INFO) << "Convolution forward without bias"; +// LOG(INFO) << conv_fwd_; + + src_mem_->set_data_handle(src); + diff_weights_mem_->set_data_handle(diff_w); + diff_dst_mem_->set_data_handle(diff_dst); + //conv_fwd_->execute(); + bwd_weights_stream_->submit(bwd_weights_primitives_); + src_mem_->set_data_handle(dummy); + diff_weights_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + return; +} + +template class Convolution2DBwdWeights; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/conv_fwd.cc b/python/ideep4py/primitives/ops/conv_fwd.cc new file mode 100644 index 00000000..9c83da3b --- /dev/null +++ b/python/ideep4py/primitives/ops/conv_fwd.cc @@ -0,0 +1,179 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "conv_fwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Convolution2DFwd::Convolution2DFwd( mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d, + int dilate_y, int dilate_x, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw) +{ + fwd_stream_.reset(new stream(stream::kind::eager)); + // create conv primitive + if (conv_fwd_ == NULL) { + setup(src_d, w_d, b_d, dst_d, + dilate_y, dilate_x, + sy, sx, + pad_lh, pad_lw, + pad_rh, pad_rw); + } +} + +template +Convolution2DFwd::~Convolution2DFwd() +{ +} + +template +void Convolution2DFwd::setup(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d, + mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d, + int dilate_y, int dilate_x, + int s1, int s2, + int pl1, int pl2, + int pr1, int pr2) +{ + //LOG(INFO) << "Convolution forward_setup"; + assert(src_d != NULL); + assert(w_d != NULL); + assert(bias_d != NULL); // no bias case, expect as NONE_DIMS, not NULL + assert(dst_d != NULL); + + dilates_ = {dilate_y, dilate_x}; + strides_ = {s1, s2}; + padding_l_ = {pl1, pl2}; + padding_r_ = {pr1, pr2}; + + //LOG(INFO) << "src_d1=" << src_d[0] << ", src_d2=" << src_d[1] << "; src_d3=" << src_d[2] << ", src_d4=" << src_d[3]; + //LOG(INFO) << "w_d1=" << w_d[0] << ", w_d2=" << w_d[1] << "; w_d3=" << w_d[2] << ", w_d4=" << w_d[3]; + //LOG(INFO) << "dst_d1=" << dst_d[0] << ", dst_d2=" << dst_d[1] << "; dst_d3=" << dst_d[2] << ", dst_d4=" << dst_d[3]; + //LOG(INFO) << "dialte_y=" << dilate_y << ", dilate_x=" << dilate_x; + //LOG(INFO) << "sy=" << s1 << ", sx=" << s2; + //LOG(INFO) << "pl1=" << pl1 << ", pl2=" << pl2 << ", pr1=" << pr1 << ", pr2=" << pr2; + + /* create memory descriptors for convolution data w/ no specified format */ + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + memory::format::any)); + weights_md_.reset(new memory::desc({w_d}, + memory_data_type(), memory::format::any)); + dst_md_.reset(new memory::desc({dst_d}, memory_data_type(), + memory::format::any)); + if (!b_d.empty()) + bias_md_.reset(new memory::desc({b_d}, memory_data_type(), + memory::format::any)); + /* create a convolution */ + if (!b_d.empty()) { + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *src_md_, *weights_md_, *bias_md_, + *dst_md_, strides_, dilates_, padding_l_, padding_r_, + padding_kind::zero)); + } else { + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *src_md_, *weights_md_, + *dst_md_, strides_, dilates_, padding_l_, padding_r_, + padding_kind::zero)); + } + + fwd_pd_.reset(new convolution_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + //store the expected memory format + src_fmt_ = static_cast(fwd_pd_.get()->src_primitive_desc().desc().data.format); + weights_fmt_ = static_cast(fwd_pd_.get()->weights_primitive_desc().desc().data.format); + dst_fmt_ = static_cast(fwd_pd_.get()->dst_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), dummy)); + weights_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(), dummy)); + dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), dummy)); + + /* create convolution primitive and add it to net */ + if (!b_d.empty()) { + bias_mem_.reset(new memory({{{b_d}, memory_data_type(), memory::format::x}, 
cpu_engine}, dummy)); + conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, + *weights_mem_, *bias_mem_, *dst_mem_)); + } else { + conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, + *weights_mem_, *dst_mem_)); + } + + fwd_primitives_.push_back(*conv_fwd_); + return; +} + +template +void Convolution2DFwd::execute(void* src, void* w, void* b, void* dst) +{ + //LOG(INFO) << "Convolution forward"; + //LOG(INFO) << "conv_fwd_:" << conv_fwd_; + //LOG(INFO) << "x=" << x << "; x_size=" << x_d1*x_d2*x_d3*x_d4*4; + src_mem_->set_data_handle(src); + weights_mem_->set_data_handle(w); + bias_mem_->set_data_handle(b); + dst_mem_->set_data_handle(dst); + //conv_fwd_->execute(); + fwd_stream_->submit(fwd_primitives_); + + //after exec, set data handle back + src_mem_->set_data_handle(dummy); + weights_mem_->set_data_handle(dummy); + bias_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + + return; +} + +template +void Convolution2DFwd::execute(void* src, void* w, void* dst) +{ + //LOG(INFO) << "Convolution forward without bias"; +// LOG(INFO) << conv_fwd_; + + src_mem_->set_data_handle(src); + weights_mem_->set_data_handle(w); + dst_mem_->set_data_handle(dst); + //conv_fwd_->execute(); + fwd_stream_->submit(fwd_primitives_); + + //after exec, set data handle back + src_mem_->set_data_handle(dummy); + weights_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + + return; +} + +template class Convolution2DFwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/eltwise_bwd.cc b/python/ideep4py/primitives/ops/eltwise_bwd.cc new file mode 100644 index 00000000..f15df8eb --- /dev/null +++ b/python/ideep4py/primitives/ops/eltwise_bwd.cc @@ -0,0 +1,110 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "eltwise_bwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +EltwiseBwd::EltwiseBwd(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta) +{ + bwd_stream_.reset(new stream(stream::kind::eager)); + // create eltwise primitive + if (eltwise_bwd_ == nullptr) { + setup(src_d, alg_kind, dst_diff_fmt, alpha, beta); + } +} + +template +EltwiseBwd::~EltwiseBwd() +{ +} + +template +void EltwiseBwd::setup(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format dst_diff_fmt, T2 alpha, T2 beta) +{ + //LOG(INFO) << "Eltwise backward_setup"; + assert(src_d != nullptr); + + /* create memory descriptors for eltwise data w/ no specified format */ + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + dst_diff_fmt)); + dst_diff_md_.reset(new memory::desc({src_d}, memory_data_type(), + dst_diff_fmt)); + src_mpd_.reset(new memory::primitive_desc(*src_md_, cpu_engine)); + dst_diff_mpd_.reset(new memory::primitive_desc(*dst_diff_md_, cpu_engine)); + /* create a eltwise*/ + fwd_desc_.reset(new eltwise_forward::desc(prop_kind::forward, alg_kind, + *src_md_, alpha, beta)); + fwd_pd_.reset(new eltwise_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + bwd_desc_.reset(new eltwise_backward::desc(alg_kind, + *dst_diff_md_, *src_md_, alpha, beta)); + + bwd_pd_.reset(new eltwise_backward::primitive_desc(*bwd_desc_, cpu_engine, *fwd_pd_)); + + //store the expected memory format + src_diff_fmt_ = static_cast(bwd_pd_.get()->diff_src_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + src_mem_.reset(new memory(*src_mpd_, dummy)); + dst_diff_mem_.reset(new memory(*dst_diff_mpd_, dummy)); + src_diff_mem_.reset(new memory(bwd_pd_.get()->diff_src_primitive_desc(), dummy)); + + /* create eltwise primitive and add it to net */ + eltwise_bwd_.reset(new eltwise_backward(*bwd_pd_, *src_mem_, *dst_diff_mem_, *src_diff_mem_)); + + bwd_primitives_.push_back(*eltwise_bwd_); + return; +} + +template +void EltwiseBwd::execute(void* src, void* dst_diff, void* src_diff) +{ + //LOG(INFO) << "Eltwise backward"; + + src_mem_->set_data_handle(src); + dst_diff_mem_->set_data_handle(dst_diff); + src_diff_mem_->set_data_handle(src_diff); + bwd_stream_->submit(bwd_primitives_); + + //after exec, set data handle back + src_mem_->set_data_handle(dummy); + dst_diff_mem_->set_data_handle(dummy); + src_diff_mem_->set_data_handle(dummy); + + return; +} + +template class EltwiseBwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/eltwise_fwd.cc b/python/ideep4py/primitives/ops/eltwise_fwd.cc new file mode 100644 index 00000000..0b7431c8 --- /dev/null +++ b/python/ideep4py/primitives/ops/eltwise_fwd.cc @@ -0,0 +1,101 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "eltwise_fwd.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +EltwiseFwd::EltwiseFwd(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta) +{ + fwd_stream_.reset(new stream(stream::kind::eager)); + // create eltwise primitive + if (eltwise_fwd_ == nullptr) { + setup(src_d, alg_kind, src_fmt, alpha, beta); + } +} + +template +EltwiseFwd::~EltwiseFwd() +{ +} + +template +void EltwiseFwd::setup(mkldnn::memory::dims src_d, mkldnn::algorithm alg_kind, mkldnn::memory::format src_fmt, T2 alpha, T2 beta) +{ + //LOG(INFO) << "Eltwise forward_setup"; + assert(src_d != nullptr); + + /* create memory descriptors for eltwise data w/ no specified format */ + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + src_fmt)); + src_mpd_.reset(new memory::primitive_desc(*src_md_, cpu_engine)); + /* create a eltwise*/ + fwd_desc_.reset(new eltwise_forward::desc(prop_kind::forward, alg_kind, + *src_md_, alpha, beta)); + + fwd_pd_.reset(new eltwise_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + //store the expected memory format + src_fmt_ = src_fmt; + dst_fmt_ = static_cast(fwd_pd_.get()->dst_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + src_mem_.reset(new memory(*src_mpd_, dummy)); + dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), dummy)); + + /* create eltwise primitive and add it to net */ + eltwise_fwd_.reset(new eltwise_forward(*fwd_pd_, *src_mem_, *dst_mem_)); + + fwd_primitives_.push_back(*eltwise_fwd_); + return; +} + +template +void EltwiseFwd::execute(void* src, void* dst) +{ + //LOG(INFO) << "Eltwise forward"; + + src_mem_->set_data_handle(src); + dst_mem_->set_data_handle(dst); + fwd_stream_->submit(fwd_primitives_); + + //after exec, set data handle back + src_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + + return; +} + +template class EltwiseFwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/linear_bwd_data.cc b/python/ideep4py/primitives/ops/linear_bwd_data.cc new file mode 100644 index 00000000..018accb9 --- /dev/null +++ b/python/ideep4py/primitives/ops/linear_bwd_data.cc @@ -0,0 +1,114 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "linear_bwd_data.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +LinearBwdData::LinearBwdData( + mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d + ) +{ + bwd_data_stream_.reset(new stream(stream::kind::eager)); + //create linear primitive + if (linear_bwd_data_ == NULL) { + setup(diff_src_d, w_d, diff_dst_d); + } +} +template +LinearBwdData::~LinearBwdData() +{ +} + +template +void LinearBwdData::setup( + mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims w_d, + mkldnn::memory::dims diff_dst_d + ) +{ + assert(diff_src_d != NULL); + assert(w_d != NULL); + assert(diff_dst_d != NULL); + + diff_src_md_.reset(new memory::desc({diff_src_d}, memory_data_type(), memory::format::any)); + weights_md_.reset(new memory::desc({w_d}, memory_data_type(), memory::format::any)); + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), memory::format::any)); + //LOG(INFO) << "diff_src_d" << diff_src_d[0]<<", "<(bwd_data_pd_.get()->diff_src_primitive_desc().desc().data.format); + weights_fmt_ = static_cast(bwd_data_pd_.get()->weights_primitive_desc().desc().data.format); + diff_dst_fmt_ = static_cast(bwd_data_pd_.get()->diff_dst_primitive_desc().desc().data.format); + + // create memory primitive based on dummy data + diff_src_mem_.reset(new memory(bwd_data_pd_.get()->diff_src_primitive_desc(), dummy)); + weights_mem_.reset(new memory(bwd_data_pd_.get()->weights_primitive_desc(), dummy)); + diff_dst_mem_.reset(new memory(bwd_data_pd_.get()->diff_dst_primitive_desc(), dummy)); + + //create linear primitive and add it to net + linear_bwd_data_.reset(new inner_product_backward_data(*bwd_data_pd_, *diff_dst_mem_, *weights_mem_, *diff_src_mem_)); + bwd_data_primitives_.push_back(*linear_bwd_data_); + return; +} + +template +void LinearBwdData::execute(void* diff_src, void* w, void* diff_dst) +{ + //LOG(INFO) << "linear fwd without bias" + diff_src_mem_->set_data_handle(diff_src); + weights_mem_->set_data_handle(w); + diff_dst_mem_->set_data_handle(diff_dst); + //linear_bwd->execute(); + bwd_data_stream_->submit(bwd_data_primitives_); + diff_src_mem_->set_data_handle(dummy); + weights_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + return; +} + +template class LinearBwdData; + + diff --git a/python/ideep4py/primitives/ops/linear_bwd_weights.cc b/python/ideep4py/primitives/ops/linear_bwd_weights.cc new file mode 100644 index 00000000..eb97cdb2 --- /dev/null +++ b/python/ideep4py/primitives/ops/linear_bwd_weights.cc @@ -0,0 +1,138 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "mkldnn.hpp" +#include "linear_bwd_weights.h" +#include "utils.h" +#include "common.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +LinearBwdWeights::LinearBwdWeights( + mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d) +{ + bwd_weights_stream_.reset(new stream(stream::kind::eager)); + //create linear primitive + if (linear_bwd_weights_ == NULL) { + setup(src_d, diff_w_d, diff_b_d, diff_dst_d); + } +} + +template +LinearBwdWeights::~LinearBwdWeights() +{ +} + +template +void LinearBwdWeights::setup(mkldnn::memory::dims src_d, mkldnn::memory::dims diff_w_d, + mkldnn::memory::dims diff_b_d, mkldnn::memory::dims diff_dst_d) +{ + assert(src_d != NULL); + assert(diff_w_d != NULL); + assert(diff_b_d != NULL); + assert(diff_dst_d != NULL); + + src_md_.reset(new memory::desc({src_d}, memory_data_type(), memory::format::any)); + diff_weights_md_.reset(new memory::desc({diff_w_d}, memory_data_type(), memory::format::any)); + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), memory::format::any)); + //LOG(INFO) << "src_d"<(), memory::format::any)); + bwd_weights_desc_.reset(new inner_product_backward_weights::desc(*src_md_, *diff_weights_md_, + *diff_bias_md_, *diff_dst_md_)); + } else { + bwd_weights_desc_.reset(new inner_product_backward_weights::desc(*src_md_, *diff_weights_md_, + *diff_dst_md_)); + } + + //FIXME + //jiangzho, Current linear bwd need a fwd pd as hint, will remove in future + fwd_desc_.reset(new inner_product_forward::desc(prop_kind::forward, *src_md_, + *diff_weights_md_, *diff_dst_md_)); + fwd_pd_.reset(new inner_product_forward::primitive_desc(*fwd_desc_, cpu_engine)); + bwd_weights_pd_.reset(new inner_product_backward_weights::primitive_desc(*bwd_weights_desc_, cpu_engine, *fwd_pd_)); + + //store the expected memory format + src_fmt_ = static_cast(bwd_weights_pd_.get()->src_primitive_desc().desc().data.format); + diff_weights_fmt_ = static_cast(bwd_weights_pd_.get()->diff_weights_primitive_desc().desc().data.format); + diff_dst_fmt_ = static_cast(bwd_weights_pd_.get()->diff_dst_primitive_desc().desc().data.format); + + //create linear primitive and add it to net + src_mem_.reset(new memory(bwd_weights_pd_.get()->src_primitive_desc(), dummy)); + diff_weights_mem_.reset(new memory(bwd_weights_pd_.get()->diff_weights_primitive_desc(), dummy)); + 
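+    // Note: "dummy" is a shared placeholder buffer. Each primitive is built
+    // once against these dummy memory objects; execute() temporarily points
+    // them at the caller's real buffers with set_data_handle(), so the
+    // primitive never has to be re-created per call.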
diff_dst_mem_.reset(new memory(bwd_weights_pd_.get()->diff_dst_primitive_desc(), dummy)); + //create linear primitive and add it to net + if (!diff_b_d.empty()) { + diff_bias_mem_.reset(new memory({{{diff_b_d}, memory_data_type(), memory::format::x}, cpu_engine}, dummy)); + linear_bwd_weights_.reset(new inner_product_backward_weights(*bwd_weights_pd_, *src_mem_, *diff_dst_mem_, + *diff_weights_mem_, *diff_bias_mem_)); + } else { + linear_bwd_weights_.reset(new inner_product_backward_weights(*bwd_weights_pd_, *src_mem_, *diff_dst_mem_, + *diff_weights_mem_)); + } + bwd_weights_primitives_.push_back(*linear_bwd_weights_); + return; +} + +template +void LinearBwdWeights::execute(void* src, void* diff_w, void* diff_b, void* diff_dst) +{ + //LOG(INFO) << "linear backward weights"; + src_mem_->set_data_handle(src); + diff_weights_mem_->set_data_handle(diff_w); + diff_bias_mem_->set_data_handle(diff_b); + diff_dst_mem_->set_data_handle(diff_dst); + bwd_weights_stream_->submit(bwd_weights_primitives_); + src_mem_->set_data_handle(dummy); + diff_weights_mem_->set_data_handle(dummy); + diff_bias_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + return; +} + +template +void LinearBwdWeights::execute(void* src, void* diff_w, void* diff_dst) +{ + // LOG(INFO) << "linear without bias"; + src_mem_->set_data_handle(src); + diff_weights_mem_->set_data_handle(diff_w); + diff_dst_mem_->set_data_handle(diff_dst); + bwd_weights_stream_->submit(bwd_weights_primitives_); + src_mem_->set_data_handle(dummy); + diff_weights_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + return; +} + +template class LinearBwdWeights; + diff --git a/python/ideep4py/primitives/ops/linear_fwd.cc b/python/ideep4py/primitives/ops/linear_fwd.cc new file mode 100644 index 00000000..220a33ed --- /dev/null +++ b/python/ideep4py/primitives/ops/linear_fwd.cc @@ -0,0 +1,142 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ *
+ */
+
+
+#include <glog/logging.h>
+#include <iostream>
+#include "mkldnn.hpp"
+#include "linear_fwd.h"
+#include "utils.h"
+#include "common.h"
+
+using namespace mkldnn;
+
+extern engine cpu_engine;
+
+template <typename T>
+LinearFwd<T>::LinearFwd(
+        mkldnn::memory::dims src_d, mkldnn::memory::dims w_d,
+        mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d)
+{
+    fwd_stream_.reset(new stream(stream::kind::eager));
+    // create linear primitive
+    if (linear_fwd_ == NULL) {
+        setup(src_d, w_d, b_d, dst_d);
+    }
+}
+
+template <typename T>
+LinearFwd<T>::~LinearFwd()
+{
+}
+
+template <typename T>
+void LinearFwd<T>::setup(mkldnn::memory::dims src_d, mkldnn::memory::dims w_d,
+        mkldnn::memory::dims b_d, mkldnn::memory::dims dst_d)
+{
+    //LOG(INFO) << "Linear forward setup";
+    assert(src_d != NULL);
+    assert(w_d != NULL);
+    assert(b_d != NULL); // no bias case, expect as NONE_DIMS, not NULL
+    assert(dst_d != NULL);
+    src_md_.reset(new memory::desc({src_d}, memory_data_type<T>(),
+                memory::format::any));
+    weights_md_.reset(new memory::desc({w_d}, memory_data_type<T>(),
+                memory::format::any));
+    dst_md_.reset(new memory::desc({dst_d}, memory_data_type<T>(),
+                memory::format::any));
+    //LOG(INFO) << "src_d" << src_d[0] << "," << src_d[1];
+    //LOG(INFO) << "weight" << w_d[0] << "," << w_d[1];
+    //LOG(INFO) << "dst_d" << dst_d[0] << "," << dst_d[1];
+    // create linear layer descriptor
+    if (!b_d.empty()) {
+        bias_md_.reset(new memory::desc({b_d}, memory_data_type<T>(),
+                    memory::format::any));
+        fwd_desc_.reset(new inner_product_forward::desc(prop_kind::forward, *src_md_,
+                    *weights_md_, *bias_md_, *dst_md_));
+    } else {
+        fwd_desc_.reset(new inner_product_forward::desc(prop_kind::forward, *src_md_,
+                    *weights_md_, *dst_md_));
+    }
+    //----------- Determine which engine to use ------------------
+    // Currently the engine is fixed to the MKL-DNN CPU engine.
+    fwd_pd_.reset(new inner_product_forward::primitive_desc(*fwd_desc_, cpu_engine));
+    // store the expected memory formats
+    src_fmt_ = static_cast<mkldnn::memory::format>(fwd_pd_.get()->src_primitive_desc().desc().data.format);
+    weights_fmt_ = static_cast<mkldnn::memory::format>(fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+    dst_fmt_ = static_cast<mkldnn::memory::format>(fwd_pd_.get()->dst_primitive_desc().desc().data.format);
+
+    // create memory primitives based on dummy data
+    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), dummy));
+    weights_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(), dummy));
+    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), dummy));
+
+    /* create linear primitive and add it to net */
+    if (!b_d.empty()) {
+        bias_mem_.reset(new memory({{{b_d}, memory_data_type<T>(), memory::format::x}, cpu_engine}, dummy));
+        linear_fwd_.reset(new inner_product_forward(*fwd_pd_, *src_mem_,
+                    *weights_mem_, *bias_mem_, *dst_mem_));
+    } else {
+        linear_fwd_.reset(new inner_product_forward(*fwd_pd_, *src_mem_,
+                    *weights_mem_, *dst_mem_));
+    }
+    fwd_primitives_.push_back(*linear_fwd_);
+    return;
+}
+
+template <typename T>
+void LinearFwd<T>::execute(void* src, void* w, void* b, void* dst)
+{
+    //LOG(INFO) << "Linear forward";
+    src_mem_->set_data_handle(src);
+    weights_mem_->set_data_handle(w);
+    bias_mem_->set_data_handle(b);
+    dst_mem_->set_data_handle(dst);
+    fwd_stream_->submit(fwd_primitives_);
+    // after exec, set data handles back
+    src_mem_->set_data_handle(dummy);
+    weights_mem_->set_data_handle(dummy);
+    bias_mem_->set_data_handle(dummy);
+    dst_mem_->set_data_handle(dummy);
+    return;
+}
+
+template <typename T>
+void LinearFwd<T>::execute(void* src, void* w, void* dst)
+{
+    //LOG(INFO) << "Linear forward without bias";
+    src_mem_->set_data_handle(src);
weights_mem_->set_data_handle(w); + dst_mem_->set_data_handle(dst); + //linear_fwd_->execute(); + fwd_stream_->submit(fwd_primitives_); + //after exec, set data handle bac + src_mem_->set_data_handle(dummy); + weights_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + return; +} +template class LinearFwd; + + diff --git a/python/ideep4py/primitives/ops/lrn_bwd.cc b/python/ideep4py/primitives/ops/lrn_bwd.cc new file mode 100755 index 00000000..67832bed --- /dev/null +++ b/python/ideep4py/primitives/ops/lrn_bwd.cc @@ -0,0 +1,137 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "lrn_bwd.h" +#include "utils.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +LocalResponseNormalizationBwd::LocalResponseNormalizationBwd( + mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind):alg_kind_(mkldnn::algorithm::lrn_across_channels) +{ + bwd_stream_.reset(new stream(stream::kind::eager)); + // setup + if ( bwd_ == NULL){ + setup(src_d, diff_dst_d, ws_d, ws_dt, n, k, alpha, beta, alg_kind_); + } +} + +template +LocalResponseNormalizationBwd::~LocalResponseNormalizationBwd(){} + +template +void LocalResponseNormalizationBwd::setup( + mkldnn::memory::dims src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind) +{ + //LOG(INFO) << "lrn backward_setup"; + + //LOG(INFO) << "src_d[0]=" << src_d[0] << "; src_d[1]" << src_d[1] << "; src_d[2]=" << src_d[2] << "; src_d[3]=" << src_d[3]; + // LOG(INFO) << "diff_dst_d[0]=" << diff_dst_d[0] << "; diff_dst_d[1]" << diff_dst_d[1] << "; diff_dst_d[2]=" << diff_dst_d[2] << "; diff_dst_d[3]=" << diff_dst_d[3]; + // LOG(INFO) << "ws_d[0]=" << ws_d[0] << "; ws_d[1]" << ws_d[1] << "; ws_d[2]=" << ws_d[2] << "; ws_d[3]=" << ws_d[3]; + + alg_kind_ = alg_kind; + + // create memory desc + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + get_desired_format(src_d[1]))); + + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), + get_desired_format(diff_dst_d[1]))); // use diff dst chanel to decide fmt + + //Need a forward hint to create backward, will be removed in future + // create a lrn descriptor + 
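+    // The lrn_forward descriptor built below exists only to provide the
+    // "hint" primitive_desc that the mkldnn v0.x lrn_backward constructor
+    // requires; it is never executed by this class. A minimal sketch of the
+    // pattern (names assumed for illustration):
+    //
+    //     lrn_forward::desc fwd_d(prop_kind::forward_training, alg, md,
+    //                             n, alpha, beta, k);
+    //     lrn_forward::primitive_desc hint(fwd_d, cpu_engine);
+    //     lrn_backward::primitive_desc bwd(bwd_d, cpu_engine, hint);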
fwd_desc_.reset(new lrn_forward::desc(prop_kind::forward_training, alg_kind_, + *diff_dst_md_, n, alpha, beta, k)); + fwd_pd_.reset(new lrn_forward::primitive_desc( *fwd_desc_, cpu_engine)); + + bwd_desc_.reset(new lrn_backward::desc(alg_kind_, + *src_md_, *diff_dst_md_,n, alpha, beta, k)); + bwd_pd_.reset(new lrn_backward::primitive_desc(*bwd_desc_, cpu_engine, + *fwd_pd_)); + + // store expected primitive format + diff_src_fmt_ = static_cast(bwd_pd_.get()->diff_src_primitive_desc().desc().data.format); + diff_dst_fmt_ = get_desired_format(diff_dst_d[1]); + src_fmt_ = get_desired_format(diff_dst_d[1]); + + // create MKL-DNN internal memory object with dummy data + src_mem_.reset(new memory({{{src_d}, memory_data_type(), src_fmt_}, cpu_engine}, dummy)); + diff_src_mem_.reset(new memory(bwd_pd_.get()->diff_src_primitive_desc(), dummy)); + diff_dst_mem_.reset(new memory({{{diff_dst_d}, memory_data_type(), diff_dst_fmt_}, cpu_engine}, dummy)); + + // store workspace's dims and fmt to create ws tensor + ws_fmt_ = get_desired_format(ws_d[1]); + ws_mem_.reset(new memory({{{ws_d}, ws_dt, ws_fmt_}, cpu_engine}, dummy)); // use ws dims's channel to decide format + + bwd_.reset(new lrn_backward( + *bwd_pd_, *src_mem_, *diff_dst_mem_, *ws_mem_, *diff_src_mem_)); + + bwd_primitives_.push_back(*bwd_); + return; +} + +template +void LocalResponseNormalizationBwd::execute(void*src, void *diff_src, void *diff_dst, void *ws) +{ + //LOG(INFO) << "lrn backward"; + + diff_src_mem_->set_data_handle(diff_src); // + diff_dst_mem_->set_data_handle(diff_dst); // + src_mem_->set_data_handle(src); + + assert(ws!=NULL); + ws_mem_->set_data_handle(ws); // output workspace + + + bwd_stream_->submit(bwd_primitives_); + + // set back data handle + diff_src_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + src_mem_->set_data_handle(dummy); + assert(ws!=NULL); + ws_mem_->set_data_handle(dummy); + + //LOG(INFO) << "lrn backward finish"; + return; +} + +template class LocalResponseNormalizationBwd; diff --git a/python/ideep4py/primitives/ops/lrn_fwd.cc b/python/ideep4py/primitives/ops/lrn_fwd.cc new file mode 100755 index 00000000..cc800358 --- /dev/null +++ b/python/ideep4py/primitives/ops/lrn_fwd.cc @@ -0,0 +1,126 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "lrn_fwd.h" +#include "utils.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +LocalResponseNormalizationFwd::LocalResponseNormalizationFwd( + mkldnn::memory::dims src_d, mkldnn::memory::format src_fmt, + int n, double k, double alpha, double beta, + mkldnn::algorithm) + :alg_kind_(algorithm::lrn_across_channels) +{ + + fwd_stream_.reset(new stream(stream::kind::eager)); + // setup + if (fwd_ == NULL){ + setup(src_d, src_fmt, n, k, alpha, beta, alg_kind_); + } +} + +template +LocalResponseNormalizationFwd::~LocalResponseNormalizationFwd(){} + +template +void LocalResponseNormalizationFwd::setup( + mkldnn::memory::dims src_d, mkldnn::memory::format src_fmt, + int n, double k, double alpha, double beta, + mkldnn::algorithm alg_kind) +{ + //LOG(INFO) << "lrn forward_setup"; + + //LOG(INFO) << "src_d[0]=" << src_d[0] << "; src_d[1]" << src_d[1] << "; src_d[2]=" << src_d[2] << "; src_d[3]=" << src_d[3]; + alg_kind_ = alg_kind; + // local_size_ = n; + + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + get_desired_format(src_d[1]))); // use src's input channel to decide expected fmt + // src_md_.reset(new memory::desc({src_d}, memory_data_type(), + // src_fmt)); + + //LOG(INFO) << "lrn_fwd_desc_"; + fwd_desc_.reset(new lrn_forward::desc(prop_kind::forward_training, alg_kind_, + *src_md_, n, alpha, beta, k)); + fwd_pd_.reset(new lrn_forward::primitive_desc(*fwd_desc_, cpu_engine)); + + // store expected primitive format + src_fmt_ = get_desired_format(src_d[1]); + // src_fmt_ = src_fmt; + //LOG(INFO) << "src_fmt is " << src_fmt <<" desired src_fmt_ is "<(fwd_pd_.get()->dst_primitive_desc().desc().data.format); + + // create MKL-DNN internal memory object with dummy data + src_mem_.reset(new memory({{{src_d}, memory_data_type(), src_fmt_}, cpu_engine}, dummy)); + dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), dummy)); + + //need to return workspace for backward + auto ws_pd = fwd_pd_.get()->workspace_primitive_desc().desc().data; + // store workspace's dims and fmt to create ws tensor + ws_fmt_ = static_cast(ws_pd.format); + ws_dims_.assign(ws_pd.dims, ws_pd.dims + ws_pd.ndims); + ws_dt_ = static_cast(ws_pd.data_type); + ws_size_ = fwd_pd_.get()->workspace_primitive_desc().get_size(); + ws_mem_.reset(new memory(fwd_pd_.get()->workspace_primitive_desc(), dummy)); + + fwd_.reset(new lrn_forward( + *fwd_pd_, *src_mem_, *ws_mem_, *dst_mem_)); + + fwd_primitives_.push_back(*fwd_); + return; +} + +template +void LocalResponseNormalizationFwd::execute(void *src, void *dst, void *ws) +{ + //LOG(INFO) << "lrn forward"; + + src_mem_->set_data_handle(src); // input + dst_mem_->set_data_handle(dst); // output dst + + assert(ws!=NULL); + ws_mem_->set_data_handle(ws); // output workspace + + fwd_stream_->submit(fwd_primitives_); + + // set back data handle + src_mem_->set_data_handle(dummy); + dst_mem_->set_data_handle(dummy); + + assert(ws!=NULL); + ws_mem_->set_data_handle(dummy); + + //LOG(INFO) << "lrn forward finish"; + return; +} + +template class LocalResponseNormalizationFwd; diff --git a/python/ideep4py/primitives/ops/pooling_bwd.cc b/python/ideep4py/primitives/ops/pooling_bwd.cc new file mode 100644 index 00000000..47c90c66 --- /dev/null +++ b/python/ideep4py/primitives/ops/pooling_bwd.cc @@ -0,0 +1,167 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "pooling_bwd.h" +#include "utils.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Pooling2DBwd::Pooling2DBwd(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind ) +{ + bwd_stream_.reset(new stream(stream::kind::eager)); + // setup + if ( bwd_ == NULL) + setup(diff_src_d, diff_dst_d, ws_d, ws_dt, ker_h, ker_w, sy, sx, + pad_lh, pad_lw, pad_rh, pad_rw, alg_kind); +} + +template +Pooling2DBwd::~Pooling2DBwd() +{ +} + +template +void Pooling2DBwd::setup(mkldnn::memory::dims diff_src_d, + mkldnn::memory::dims diff_dst_d, + mkldnn::memory::dims ws_d, + mkldnn::memory::data_type ws_dt, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind ) +{ + //LOG(INFO) << "Pooling backward_setup"; + + if (alg_kind != pooling_max && alg_kind != pooling_avg + && alg_kind != pooling_avg_include_padding && alg_kind != pooling_avg_exclude_padding) { + //LOG(ERROR) << "alg_kind must be either pooling_max or " + // << "pooling_avg"; + } + + alg_kind_ = alg_kind; + memory::dims strides = {sy, sx}; + memory::dims padding_l = {pad_lh, pad_lw}; + memory::dims padding_r = {pad_rh, pad_rw}; + memory::dims kernel = {ker_h, ker_w}; + + // create memory desc + diff_src_md_.reset(new memory::desc({diff_src_d}, memory_data_type(), + memory::format::any)); // + // FIXME + // Pooling doesn't expose to get the diff_dst_primitive_desc, so we need to hard set the fmt for diff dst + // a util function is used to do this, may be broken the condition in future + diff_dst_md_.reset(new memory::desc({diff_dst_d}, memory_data_type(), + get_desired_format(diff_dst_d[1]))); // use diff dst chanel to decide fmt + + // create a pooling descriptor + bwd_desc_.reset(new pooling_backward::desc( + alg_kind, + *diff_src_md_, *diff_dst_md_, + strides, kernel, padding_l, padding_r, + padding_kind::zero)); + + //FIXME + //Need a forward hint to create backward, will be removed in future + // create a pooling descriptor + fwd_desc_.reset(new pooling_forward::desc(prop_kind::forward_training, + alg_kind, + *diff_src_md_, *diff_dst_md_, + strides, kernel, padding_l, padding_r, + padding_kind::zero)); + fwd_pd_.reset(new 
pooling_forward::primitive_desc( *fwd_desc_, cpu_engine)); + + bwd_pd_.reset(new pooling_backward::primitive_desc( + *bwd_desc_, cpu_engine, *fwd_pd_)); + + // store expected primitive format + diff_src_fmt_ = static_cast(bwd_pd_.get()->diff_src_primitive_desc().desc().data.format); + diff_dst_fmt_ = get_desired_format(diff_dst_d[1]); + + // create MKL-DNN internal memory object with dummy data + diff_src_mem_.reset(new memory(bwd_pd_.get()->diff_src_primitive_desc(), dummy)); + diff_dst_mem_.reset(new memory({{{diff_dst_d}, memory_data_type(), diff_dst_fmt_}, cpu_engine}, dummy)); + + // for max pooling, need to return workspace for backward + if (alg_kind == pooling_max) { + //FIXME + //Pooling backward doesn't expose to get the workspace_primitive_desc, we need to hard set here + // store workspace's dims and fmt to create ws tensor + ws_fmt_ = get_desired_format(ws_d[1]); + ws_mem_.reset(new memory({{{ws_d}, ws_dt, ws_fmt_}, cpu_engine}, dummy)); // use ws dims's channel to decide format + + bwd_.reset(new pooling_backward( + *bwd_pd_, *diff_dst_mem_, *ws_mem_, *diff_src_mem_)); + } else { + bwd_.reset(new pooling_backward( + *bwd_pd_, *diff_dst_mem_, *diff_src_mem_)); + } + + bwd_primitives_.push_back(*bwd_); + return; +} + +template +void Pooling2DBwd::execute(void *diff_src, void *diff_dst, void *ws) +{ + //LOG(INFO) << "Pooling backward"; + + diff_src_mem_->set_data_handle(diff_src); // input + diff_dst_mem_->set_data_handle(diff_dst); // output dst + if ( alg_kind_ == pooling_max ) { // max pooling must have ws + assert(ws!=NULL); + ws_mem_->set_data_handle(ws); // output workspace + } + + bwd_stream_->submit(bwd_primitives_); + + // set back data handle + diff_src_mem_->set_data_handle(dummy); + diff_dst_mem_->set_data_handle(dummy); + if ( alg_kind_ == pooling_max ) { // max pooling must have ws + assert(ws!=NULL); + ws_mem_->set_data_handle(dummy); + } + + //LOG(INFO) << "Pooling backward finish"; + return; +} + +template class Pooling2DBwd; + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/primitives/ops/pooling_fwd.cc b/python/ideep4py/primitives/ops/pooling_fwd.cc new file mode 100644 index 00000000..00c42932 --- /dev/null +++ b/python/ideep4py/primitives/ops/pooling_fwd.cc @@ -0,0 +1,156 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "pooling_fwd.h" +#include "utils.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Pooling2DFwd::Pooling2DFwd(mkldnn::memory::dims src_d, + mkldnn::memory::dims dst_d, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind ) +{ + fwd_stream_.reset(new stream(stream::kind::eager)); + // setup + if ( fwd_ == NULL) + setup(src_d, dst_d, ker_h, ker_w, sy, sx, + pad_lh, pad_lw, pad_rh, pad_rw, alg_kind); +} + +template +Pooling2DFwd::~Pooling2DFwd() +{ +} + +template +void Pooling2DFwd::setup(mkldnn::memory::dims src_d, + mkldnn::memory::dims dst_d, + int ker_h, int ker_w, + int sy, int sx, + int pad_lh, int pad_lw, int pad_rh, int pad_rw, + mkldnn::algorithm alg_kind ) +{ + //LOG(INFO) << "Pooling forward_setup"; + + if (alg_kind != pooling_max && alg_kind != pooling_avg + && alg_kind != pooling_avg_include_padding && alg_kind != pooling_avg_exclude_padding) { + //LOG(ERROR) << "alg_kind must be either pooling_max or " + //<< "pooling_avg"; + } + + alg_kind_ = alg_kind; + memory::dims strides = {sy, sx}; + memory::dims padding_l = {pad_lh, pad_lw}; + memory::dims padding_r = {pad_rh, pad_rw}; + memory::dims kernel = {ker_h, ker_w}; + + // create memory desc + // FIXME + // Pooling doesn't expose to get the src_primitive_desc, so we need to hard set the fmt for src + // a util function is used to do this, may be broken the condition in future + src_md_.reset(new memory::desc({src_d}, memory_data_type(), + get_desired_format(src_d[1]))); // use src's input channel to decide expected fmt + dst_md_.reset(new memory::desc({dst_d}, memory_data_type(), + memory::format::any)); + + // create a pooling descriptor + fwd_desc_.reset(new pooling_forward::desc(prop_kind::forward_training, + alg_kind, + *src_md_, *dst_md_, + strides, kernel, padding_l, padding_r, + padding_kind::zero)); + + fwd_pd_.reset(new pooling_forward::primitive_desc( + *fwd_desc_, cpu_engine)); + + // store expected primitive format + src_fmt_ = get_desired_format(src_d[1]); + dst_fmt_ = static_cast(fwd_pd_.get()->dst_primitive_desc().desc().data.format); + + // create MKL-DNN internal memory object with dummy data + src_mem_.reset(new memory({{{src_d}, memory_data_type(), src_fmt_}, cpu_engine}, dummy)); + dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), dummy)); + + // for max pooling, need to return workspace for backward + if (alg_kind == pooling_max) { + auto ws_pd = fwd_pd_.get()->workspace_primitive_desc().desc().data; + + // store workspace's dims and fmt to create ws tensor + ws_fmt_ = static_cast(ws_pd.format); + ws_dims_.assign(ws_pd.dims, ws_pd.dims+ws_pd.ndims); + ws_dt_ = static_cast(ws_pd.data_type); + ws_size_ = fwd_pd_.get()->workspace_primitive_desc().get_size(); + + ws_mem_.reset(new memory(fwd_pd_.get()->workspace_primitive_desc(), dummy)); + fwd_.reset(new pooling_forward( + *fwd_pd_, *src_mem_, *dst_mem_, *ws_mem_)); + } else { + fwd_.reset(new pooling_forward( + *fwd_pd_, *src_mem_, *dst_mem_)); + } + + fwd_primitives_.push_back(*fwd_); + return; +} + +template +void Pooling2DFwd::execute(void *src, void *dst, void *ws) +{ + //LOG(INFO) << "Pooling forward"; + + src_mem_->set_data_handle(src); // input + dst_mem_->set_data_handle(dst); // output dst + if ( alg_kind_ == pooling_max ) { // max pooling must have ws + assert(ws!=NULL); + ws_mem_->set_data_handle(ws); // output workspace + } + + 
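+    // submit() executes every primitive queued in fwd_primitives_ (a single
+    // pooling_forward here) against the handles set above; with an eager
+    // stream the work runs at submission time.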
+    fwd_stream_->submit(fwd_primitives_);
+
+    // set back data handles
+    src_mem_->set_data_handle(dummy);
+    dst_mem_->set_data_handle(dummy);
+    if ( alg_kind_ == pooling_max ) { // max pooling must have ws
+        assert(ws!=NULL);
+        ws_mem_->set_data_handle(dummy);
+    }
+
+    //LOG(INFO) << "Pooling forward finish";
+    return;
+}
+
+template class Pooling2DFwd<float>;
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/primitives/ops/reorder_op.cc b/python/ideep4py/primitives/ops/reorder_op.cc
new file mode 100644
index 00000000..f29a4bdd
--- /dev/null
+++ b/python/ideep4py/primitives/ops/reorder_op.cc
@@ -0,0 +1,88 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include <glog/logging.h>
+#include <iostream>
+#include "mkldnn.hpp"
+#include "reorder_op.h"
+#include "utils.h"
+#include "common.h"
+
+using namespace mkldnn;
+
+extern engine cpu_engine;
+
+template <typename T>
+ReorderOp<T>::ReorderOp(mkldnn::memory::dims dims, mkldnn::memory::format src_fmt, mkldnn::memory::format dst_fmt)
+{
+    reorder_stream_.reset(new stream(stream::kind::eager));
+    // create primitive
+    if (reorder_prim_ == NULL) {
+        setup(dims, src_fmt, dst_fmt);
+    }
+}
+
+template <typename T>
+ReorderOp<T>::~ReorderOp()
+{
+}
+
+template <typename T>
+void ReorderOp<T>::setup(mkldnn::memory::dims dims,
+        mkldnn::memory::format src_fmt,
+        mkldnn::memory::format dst_fmt)
+{
+    //LOG(INFO) << "Reorder setup";
+
+    assert(src_fmt != dst_fmt);
+
+    src_md_.reset(new memory::desc(dims, memory_data_type<T>(), src_fmt));
+    dst_md_.reset(new memory::desc(dims, memory_data_type<T>(), dst_fmt));
+
+    src_mem_.reset(new memory({*src_md_, cpu_engine}, dummy));
+    dst_mem_.reset(new memory({*dst_md_, cpu_engine}, dummy));
+
+    reorder_prim_ = std::make_shared<reorder>(reorder(*src_mem_, *dst_mem_));
+
+    return;
+}
+
+template <typename T>
+void ReorderOp<T>::execute(void* src, void* dst)
+{
+    //LOG(INFO) << "Reorder execute";
+    src_mem_->set_data_handle(src);
+    dst_mem_->set_data_handle(dst);
+    reorder_stream_->submit({*reorder_prim_});
+
+    // after exec, set data handles back
+    src_mem_->set_data_handle(dummy);
+    dst_mem_->set_data_handle(dummy);
+    return;
+}
+
+template class ReorderOp<float>;
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/primitives/pooling.cc b/python/ideep4py/primitives/pooling.cc
new file mode 100644
index 00000000..c267fa26
--- /dev/null
+++ b/python/ideep4py/primitives/pooling.cc
@@ -0,0 +1,216 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include +#include +#include "common.h" +#include "mkldnn.hpp" +#include "tensor.h" +#include "mem.h" +#include "pooling.h" +#include "utils.h" +#include "pooling_fwd.h" +#include "pooling_bwd.h" +#include "prim_factory.h" +#include "reorder_op.h" + +using namespace mkldnn; + +extern engine cpu_engine; + +template +Pooling2D::Pooling2D() +{ +} + +template +Pooling2D::~Pooling2D() +{ +} + +template +std::vector Pooling2D::Forward( + Tensor *src, + pooling_param_t *pp) +{ + std::vector outputs; + + // sanity check + mkldnn::memory::dims src_dims = (mkldnn::memory::dims)(src->dims()); + mkldnn::memory::dims dst_dims = (mkldnn::memory::dims)(pp->out_dims); + assert(src_dims == src->cxx_dims()); + + //sanity check for data type + //assuem all should have same data type as T + //FIXME + //yli135: Is it possible x and w have different data type???? 
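+    // Hypothetical caller-side sketch of this path from ideep4py: fill a
+    // pooling_param_t and call Forward(); max pooling returns
+    // [dst, workspace], average pooling returns [dst]. (Values below are
+    // illustrative only.)
+    //
+    //     pooling_param_t pp;
+    //     pp.out_dims = {n, c, out_h, out_w};
+    //     pp.kh = pp.kw = 2;
+    //     pp.sy = pp.sx = 2;
+    //     pp.pad_lh = pp.pad_lw = pp.pad_rh = pp.pad_rw = 0;
+    //     pp.algo_kind = pooling_param_t::algorithm::pooling_max;
+    //     std::vector<Tensor *> outs = Pooling2D<float>().Forward(src, &pp);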
+ assert(memory_data_type() == src->cxx_data_type()); + + // get a conv2d fwd from primitive pool + Pooling2DFwd *pooling2d_forward = NULL; + pooling2d_forward = Pooling2DFwdFactory::get(src_dims, dst_dims, + pp->kh, pp->kw, + pp->sy, pp->sx, + pp->pad_lh, pp->pad_lw, pp->pad_rh, pp->pad_rw, + pooling_algo_convert(pp->algo_kind)); + + mkldnn::memory::format src_fmt = src->cxx_format(); // src fmt in tensor + + void *src_tmp = src->data(); + shared_ptr src_reorder; + + // check wehther fmt is same + if (src_fmt == pooling2d_forward->src_fmt_) { + //LOG(INFO) << "pooling forward fmt matched"; + } else { + //LOG(INFO) << "pooling fwd fmt not match, need to reorder"; + + if (src_fmt != pooling2d_forward->src_fmt_) { + //LOG(INFO) << "src_fmt=" << src_fmt <<", pooling2d_forward->src_fmt_=" << pooling2d_forward->src_fmt_; + // FIXME: when to free the reordered memory + ReorderOp* reorder_src_op = ReorderFactory::get(src_dims, src_fmt, pooling2d_forward->src_fmt_); + src_reorder = Allocator::malloc(src->len(), MPOOL_REORDER); + //src_reorder = new avx::byte[src->len()]; + reorder_src_op->execute(src_tmp, src_reorder.get()); + src_tmp = src_reorder.get(); + } + } + + // create tensor based on primitive's dst + // assume dst and src have same data type + // Tensor *dst_tensor = new Tensor(dst_dims, src->cxx_data_type(), pooling2d_forward->dst_fmt_, cpu_engine); + auto data = Allocator::malloc(dst_dims, type2size(src->type()), MPOOL_POOLING_FWD); + Tensor *dst_tensor = new Tensor(dst_dims.size(), dst_dims, data, + (mkldnn_memory_format_t)pooling2d_forward->dst_fmt_, + src->type()); + + // do forward + // for max pooling, need to return workspace + if (pp->algo_kind == pooling_param_t::algorithm::pooling_max) { + //LOG(INFO) << "ws_dt_=" << pooling2d_forward->ws_dt_; + // workspace must be int tensor + //Tensor *ws_tensor = new Tensor((pooling2d_forward->ws_dims_), pooling2d_forward->ws_dt_, pooling2d_forward->ws_fmt_, cpu_engine); + auto ws_data = Allocator::malloc(pooling2d_forward->ws_size_, MPOOL_POOLING_FWD); + Tensor *ws_tensor = new Tensor(pooling2d_forward->ws_dims_, + static_cast(pooling2d_forward->ws_dt_), + pooling2d_forward->ws_fmt_, ws_data); + + pooling2d_forward->execute(src_tmp, dst_tensor->data(), ws_tensor->data()); + outputs.push_back(dst_tensor); + outputs.push_back(ws_tensor); + } else { + pooling2d_forward->execute(src_tmp, dst_tensor->data()); + outputs.push_back(dst_tensor); + } + + //LOG(INFO) << "Succ exec pooling forward"; + return outputs; +} + +template +Tensor *Pooling2D::Backward( + Tensor *diff_dst, + Tensor *ws, + pooling_param_t *pp) +{ + //sanity check + mkldnn::memory::dims diff_src_dims = (mkldnn::memory::dims)pp->out_dims; + mkldnn::memory::dims diff_dst_dims = (mkldnn::memory::dims)diff_dst->dims(); + assert(diff_dst_dims == diff_dst->cxx_dims()); + + mkldnn::memory::dims ws_dims; + mkldnn::memory::data_type ws_dt; + if (pp->algo_kind == pooling_param_t::algorithm::pooling_max) { + ws_dims = ws->cxx_dims(); + ws_dt = ws->cxx_data_type(); + } + // sanity check for data type + // assuem all x/w/b should have same data type as T + // FIXME + // yli135: Is it possible x and w have different data type???? 
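+    // For max pooling, the workspace tensor produced by Forward() must be
+    // handed back here unmodified: it records which input element won each
+    // pooling window, and the backward kernel uses it to route diff_dst
+    // values to the correct diff_src locations. Average pooling variants
+    // need no workspace (NONE_DIMS is passed instead).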
+    assert(memory_data_type<T>() == diff_dst->cxx_data_type());
+
+    // get a pooling2d bwd primitive from the primitive pool
+    Pooling2DBwd<T> *pooling2d_bwd = NULL;
+    if (pp->algo_kind == pooling_param_t::algorithm::pooling_max) {
+        pooling2d_bwd = Pooling2DBwdFactory<T>::get(diff_src_dims, diff_dst_dims, ws_dims, ws_dt,
+                pp->kh, pp->kw, pp->sy, pp->sx,
+                pp->pad_lh, pp->pad_lw, pp->pad_rh, pp->pad_rw,
+                pooling_algo_convert(pp->algo_kind));
+    } else {
+        pooling2d_bwd = Pooling2DBwdFactory<T>::get(diff_src_dims, diff_dst_dims, NONE_DIMS, mkldnn::memory::data_type::data_undef,
+                pp->kh, pp->kw, pp->sy, pp->sx,
+                pp->pad_lh, pp->pad_lw, pp->pad_rh, pp->pad_rw,
+                pooling_algo_convert(pp->algo_kind));
+    }
+
+    // FIXME: in this model, every call creates a new diff_src tensor; when is it freed?
+    mkldnn::memory::format ws_fmt;
+    void* ws_tmp = nullptr;
+    shared_ptr<avx::byte> ws_reorder;
+    if (pp->algo_kind == pooling_param_t::algorithm::pooling_max) {
+        ws_fmt = ws->cxx_format();
+        ws_tmp = ws->data();
+    }
+
+    mkldnn::memory::format diff_dst_fmt = diff_dst->cxx_format();
+    void* diff_dst_tmp = diff_dst->data();
+    shared_ptr<avx::byte> diff_dst_reorder;
+
+    if (pp->algo_kind == pooling_param_t::algorithm::pooling_max &&
+            ws_fmt != pooling2d_bwd->ws_fmt_) {
+        LOG(INFO) << "ws_fmt=" << ws_fmt << ", pooling2d_bwd->ws_fmt_=" << pooling2d_bwd->ws_fmt_;
+        ReorderOp<T>* reorder_ws_op = ReorderFactory<T>::get(ws_dims, ws_fmt, pooling2d_bwd->ws_fmt_);
+        ws_reorder = Allocator::malloc(ws->len(), MPOOL_REORDER);
+        //ws_reorder = new avx::byte[ws->len()];
+        reorder_ws_op->execute(ws_tmp, ws_reorder.get());
+        ws_tmp = ws_reorder.get();
+    }
+    if (diff_dst_fmt != pooling2d_bwd->diff_dst_fmt_) {
+        LOG(INFO) << "diff_dst_fmt=" << diff_dst_fmt << ", pooling2d_bwd->diff_dst_fmt_=" << pooling2d_bwd->diff_dst_fmt_;
+        ReorderOp<T>* reorder_diff_dst_op = ReorderFactory<T>::get(diff_dst_dims, diff_dst_fmt, pooling2d_bwd->diff_dst_fmt_);
+        diff_dst_reorder = Allocator::malloc(diff_dst->len(), MPOOL_REORDER);
+        //diff_dst_reorder = new avx::byte[diff_dst->len()];
+        reorder_diff_dst_op->execute(diff_dst_tmp, diff_dst_reorder.get());
+        diff_dst_tmp = diff_dst_reorder.get();
+    }
+
+    // create tensor based on the selected primitive
+    // assume dst and src have the same data type
+    // Tensor *diff_src_tensor = new Tensor(diff_src_dims, diff_dst->cxx_data_type(), pooling2d_bwd->diff_src_fmt_, cpu_engine);
+    auto data = Allocator::malloc(diff_src_dims, type2size(diff_dst->type()), MPOOL_POOLING_BWD);
+    Tensor *diff_src_tensor = new Tensor(diff_src_dims.size(), diff_src_dims, data,
+            (mkldnn_memory_format_t)pooling2d_bwd->diff_src_fmt_,
+            diff_dst->type());
+
+    pooling2d_bwd->execute(diff_src_tensor->data(), diff_dst_tmp, ws_tmp);
+
+    return diff_src_tensor;
+}
+
+
+template class Pooling2D<float>;
+
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
diff --git a/python/ideep4py/py/dlcp/dlcp.i b/python/ideep4py/py/dlcp/dlcp.i
new file mode 100644
index 00000000..be749d0a
--- /dev/null
+++ b/python/ideep4py/py/dlcp/dlcp.i
@@ -0,0 +1,6 @@
+%{
+    #define SWIG_FILE_WITH_INIT
+    #include "dlcp_py.h"
+%}
+
+%include "dlcp_py.h"
diff --git a/python/ideep4py/py/dlcp/dlcp_py.cc b/python/ideep4py/py/dlcp/dlcp_py.cc
new file mode 100644
index 00000000..67c80bb8
--- /dev/null
+++ b/python/ideep4py/py/dlcp/dlcp_py.cc
@@ -0,0 +1,30 @@
+/*
+ *COPYRIGHT
+ *All modification made by Intel Corporation: © 2017 Intel Corporation.
+ *Copyright (c) 2015 Preferred Infrastructure, Inc.
+ *Copyright (c) 2015 Preferred Networks, Inc.
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#include "dlcp_py.h" + +bool dlCompression::available = false; diff --git a/python/ideep4py/py/dlcp/dlcp_py.h b/python/ideep4py/py/dlcp/dlcp_py.h new file mode 100644 index 00000000..b59a0ed6 --- /dev/null +++ b/python/ideep4py/py/dlcp/dlcp_py.h @@ -0,0 +1,121 @@ +/* + *COPYRIGHT + *All modification made by Intel Corporation: © 2017 Intel Corporation. + *Copyright (c) 2015 Preferred Infrastructure, Inc. + *Copyright (c) 2015 Preferred Networks, Inc. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ * + */ + + +#ifndef _DLCP_PY_H_ +#define _DLCP_PY_H_ + +#include "dl_compression.h" +#include "mdarray.h" +#include "tensor.h" + +class dlCompression { +public: + enum { + dl_comp_none = DL_COMP_NONE, + dl_comp_dfp = DL_COMP_DFP, + }; + + enum { + dl_comp_ok = DL_COMP_OK, + dl_comp_fail = DL_COMP_FAIL, + dl_comp_fail_src_data_type_not_supported = + DL_COMP_FAIL_SRC_DATA_TYPE_NOT_SUPPORTED, + dl_comp_fail_ratio_not_supported = + DL_COMP_FAIL_RATIO_NOT_SUPPORTED, + dl_comp_fail_comp_method_not_supported = + DL_COMP_FAIL_COMP_METHOD_NOT_SUPPORTED, + dl_comp_fail_invalid_compressed_format = + DL_COMP_FAIL_INVALID_COMPRESSED_FORMAT, + dl_comp_fail_not_supported = + DL_COMP_FAIL_NOT_SUPPORTED, + }; + + static bool available; + + static void init() { + available = dl_comp_check_running_environ(); + } + + static bool is_available() { + return available; + } + + static int Compress(mdarray *src, mdarray *dst, + mdarray *diff, size_t ratio, int method) { + if (!is_available()) + return DL_COMP_FAIL_NOT_SUPPORTED; + + if (src->get()->tensor()->size() != + dst->get()->tensor()->size()) + return DL_COMP_FAIL; + + if (src->get()->tensor()->type() != + dst->get()->tensor()->type()) + return DL_COMP_FAIL; + + int dtype = -1; + switch (src->get()->tensor()->type()) { + case SINT8: + dtype = DL_COMP_INT8; + break; + + case FLOAT32: + dtype = DL_COMP_FLOAT32; + break; + + default: + break; + } + + if (-1 == dtype) + return DL_COMP_FAIL_SRC_DATA_TYPE_NOT_SUPPORTED; + + return dl_comp_compress_buffer(src->get()->tensor()->data(), + dst->get()->tensor()->data(), src->get()->tensor()->size(), + diff ? diff->get()->tensor()->data() : nullptr, + (dl_comp_data_type_t)dtype, ratio, (dl_comp_method_t)method); + } + + static int Decompress(mdarray *src, mdarray *dst) { + if (!is_available()) + return DL_COMP_FAIL_NOT_SUPPORTED; + + if (src->get()->tensor()->size() != + dst->get()->tensor()->size()) + return DL_COMP_FAIL; + + if (src->get()->tensor()->type() != + dst->get()->tensor()->type()) + return DL_COMP_FAIL; + + return dl_comp_decompress_buffer(src->get()->tensor()->data(), + dst->get()->tensor()->data(), + src->get()->tensor()->size()); + } +}; + +#endif diff --git a/python/ideep4py/py/ideep4py.i b/python/ideep4py/py/ideep4py.i new file mode 100644 index 00000000..f9d350f8 --- /dev/null +++ b/python/ideep4py/py/ideep4py.i @@ -0,0 +1,41 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ *
+ */
+
+
+%module ideep4py
+
+%init %{
+    import_array();
+    implementation::g_init();
+%}
+
+%include "mdarray.i"
+%include "eltwise.i"
+%include "conv.i"
+%include "pooling.i"
+%include "linear.i"
+%include "bn.i"
+%include "concat.i"
+%include "lrn.i"
+%include "dropout.i"
+%include "dlcp.i"
diff --git a/python/ideep4py/py/mm/basic.cc b/python/ideep4py/py/mm/basic.cc
new file mode 100644
index 00000000..e3df0062
--- /dev/null
+++ b/python/ideep4py/py/mm/basic.cc
@@ -0,0 +1,71 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#include "basic.h"
+#include "tensor.h"
+
+PyObject *basic::copyto(mdarray *dst, mdarray *src)
+{
+    Tensor *tdst = dst->get()->tensor();
+    Tensor *tsrc = src->get()->tensor();
+    if (tdst->copyto(tsrc))
+        Py_RETURN_NONE;
+    return nullptr;
+}
+
+PyObject *basic::copyto(mdarray *dst, Py_buffer *src_view)
+{
+    // The buffer is also validated on the ideep4py (Python) side
+    Tensor *tdst = dst->get()->tensor();
+    if (tdst->len() != (size_t)src_view->len) {
+        return nullptr;
+    }
+    tdst->copyto((char *)src_view->buf);
+    Py_RETURN_NONE;
+}
+
+mdarray basic::acc_sum(vector<mdarray *> arrays)
+{
+    vector<shared_ptr<mkldnn::memory>> srcs_memory;
+    vector<mkldnn::memory::primitive_desc> srcs_pd;
+    vector<mkldnn::primitive::at> inputs;
+    vector<float> scales;
+    for (vector<mdarray *>::iterator it = arrays.begin();
+            it != arrays.end(); it++) {
+        Tensor *tensor = (*it)->get()->tensor();
+        scales.push_back(1.0);
+        srcs_pd.push_back(tensor->mkldnn_memory().get_primitive_desc());
+        inputs.push_back(tensor->mkldnn_memory());
+    }
+    auto sum_pd = sum::primitive_desc(scales, srcs_pd);
+    auto dst_pd = sum_pd.dst_primitive_desc();
+    Tensor *dst_tensor = new Tensor(dst_pd);
+    auto sum_p = sum(sum_pd, inputs, dst_tensor->mkldnn_memory());
+
+    mkldnn::stream s(mkldnn::stream::eager);
+    s.submit({sum_p}).wait();
+
+    mdarray dst_mdarray = mdarray(dst_tensor);
+    return dst_mdarray;
+}
diff --git a/python/ideep4py/py/mm/basic.h b/python/ideep4py/py/mm/basic.h
new file mode 100644
index 00000000..a6484963
--- /dev/null
+++ b/python/ideep4py/py/mm/basic.h
@@ -0,0 +1,36 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ *
+ */
+
+
+#pragma once
+#define NO_IMPORT_ARRAY
+#define PY_ARRAY_UNIQUE_SYMBOL basic_ARRAY_API
+#include <numpy/arrayobject.h>
+#include "mdarray.h"
+
+class basic {
+public:
+    static PyObject *copyto(mdarray *dst, mdarray *src);
+    static PyObject *copyto(mdarray *dst, Py_buffer *view);
+    static mdarray acc_sum(vector<mdarray *> arrays);
+};
diff --git a/python/ideep4py/py/mm/basic.i b/python/ideep4py/py/mm/basic.i
new file mode 100644
index 00000000..591c18b9
--- /dev/null
+++ b/python/ideep4py/py/mm/basic.i
@@ -0,0 +1,67 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ * + */ + + +%{ + #include "basic.h" +%} + +%typemap(in) (vector arrays) { + int i; + int argc; + vector varr; + if (!PyTuple_Check($input)) { + PyErr_SetString(PyExc_ValueError,"Expected a tuple"); + return nullptr; + } + argc = PyTuple_Size($input); + for (i = 0; i < argc; i++) { + PyObject *obj = PyTuple_GET_ITEM($input, i); + if (!implementation::mdarray::is_mdarray(obj)) { + PyErr_SetString(PyExc_ValueError,"Expected a mdarray in acc_sum"); + return nullptr; + } +#if 0 + if (!PyArray_Check(obj)) { + PyErr_SetString(PyExc_ValueError,"Expected a array"); + return nullptr; + } +#endif + void *that; + int res1 = SWIG_ConvertPtr(obj, &that, nullptr, 0); + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Can't convert mdarray pyobject"); + return nullptr; + } + varr.push_back((mdarray *)that); + } + $1 = varr; +} + +class basic { +public: + static PyObject *copyto(mdarray *dst, mdarray *src); + static PyObject *copyto(mdarray *dst, Py_buffer *view); + static mdarray acc_sum(vector arrays); +}; + diff --git a/python/ideep4py/py/mm/mdarray.cc b/python/ideep4py/py/mm/mdarray.cc new file mode 100755 index 00000000..5d43934a --- /dev/null +++ b/python/ideep4py/py/mm/mdarray.cc @@ -0,0 +1,916 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
+ *
+ */
+
+
+#include <Python.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/arrayobject.h>
+#if defined(OPENMP_AFFINITY)
+#include "cpu_info.h"
+#endif
+#include "mdarray.h"
+#include
+#include "mkldnn_ex.h"
+#include "dlcp_py.h"
+
+namespace implementation {
+
+static PyObject *PyType_reorder_buffer = nullptr;
+
+static swig_type_info *SwigTy_mdarray = nullptr;
+//static swig_type_info *SwigTy_engine = nullptr;
+static PyObject *PyType_mdarray = nullptr;
+
+// get the mdarray behind a PyObject
+static inline mdarray *get_mdarray_from_PyObject(PyObject *self) {
+    void *oprd_self;
+    int res = SWIG_ConvertPtr(self, &oprd_self, nullptr, 0);
+    if (!SWIG_IsOK(res)) {
+        // PyErr_SetString(PyExc_ValueError, "Error self PyObject");
+        return NULL;
+    }
+    return (reinterpret_cast<py_handle *>(oprd_self))->get();
+}
+
+// check whether mdarray supports this operation
+static inline bool is_mdarray_supported(PyObject *self, PyObject *o) {
+    // get self mdarray
+    mdarray *self_mdarray = get_mdarray_from_PyObject(self);
+    if (!self_mdarray)
+        return false;
+
+    // o is an ndarray;
+    // if the sizes are not equal, the operation is an array broadcast
+    if (reinterpret_cast<PyTypeObject *>(o->ob_type) == &PyArray_Type) {
+        if ((size_t)PyArray_SIZE(reinterpret_cast<PyArrayObject *>(o))
+                != self_mdarray->size() ||
+                !PyArray_ISFLOAT(reinterpret_cast<PyArrayObject *>(o))) {
+            return false;
+        }
+        return true;
+    }
+
+    // o is an mdarray
+    if (reinterpret_cast<PyObject *>(o->ob_type) == PyType_mdarray) {
+        // if o is an mdarray, try to get it
+        mdarray *o_mdarray = get_mdarray_from_PyObject(o);
+        if (!o_mdarray)
+            return false;
+
+        // operations between mdarrays of different sizes are not supported
+        if (o_mdarray->size() != self_mdarray->size())
+            return false;
+
+        return true;
+    }
+
+    return false;
+}
+
+PyObject *queryPyTypeObject(const char *name) {
+    swig_type_info *info = SWIG_TypeQuery(name);
+    if (info != nullptr) {
+        SwigPyClientData *cd
+            = (SwigPyClientData *)info->clientdata;
+        return reinterpret_cast<PyObject *>(cd->pytype);
+    }
+
+    throw mkldnn::error(mkldnn_invalid_arguments
+            , "Failed to find reorderer object");
+}
+
+// We brought these lookups to global scope to avoid paying their cost repeatedly
+#if PY_VERSION_HEX >= 0x03000000
+int g_init() {
+#else
+void g_init() {
+#endif
+    PyType_reorder_buffer = queryPyTypeObject("_p_reorder_buffer");
+    SwigTy_mdarray = SWIG_TypeQuery("_p_mdarray");
+    PyType_mdarray = queryPyTypeObject("_p_mdarray");
+    //SwigTy_engine = SWIG_TypeQuery("_p_mkldnn__engine");
+
+#if PY_VERSION_HEX < 0x03000000
+    if ((reinterpret_cast<PyTypeObject *>(PyType_mdarray)->tp_flags
+            & Py_TPFLAGS_HAVE_NEWBUFFER) != Py_TPFLAGS_HAVE_NEWBUFFER)
+        throw mkldnn::error(mkldnn_invalid_arguments
+                , "Python2 should have the new buffer flag on!");
+#endif
+
+    // XXX: I don't quite understand it, and its repercussions :)
+    SwigPyObject_stype = SWIG_MangledTypeQuery("_p_SwigPyObject");
+
+    if (SwigPyObject_stype == nullptr)
+        throw mkldnn::error(mkldnn_invalid_arguments
+                , "Failed to find SwigPyObject object");
+
+    // Initiate static variables imported from the numpy include
+    import_array();
+
+#if defined(OPENMP_AFFINITY)
+    google::SetStderrLogging(1);
+    google::InitGoogleLogging("mkldnn");
+    OpenMpManager::bindOpenMpThreads();
+    OpenMpManager::printVerboseInformation();
+#endif
+
+    dlCompression::init();
+
+#if PY_VERSION_HEX >= 0x03000000
+    return 0;
+#else
+    return;
+#endif
+}
+
+//FIXME: macro SWIG_as_voidptr is copied from mdarray_wrap.cpp
+#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a))
+
+// Pickle
+PyObject *mdarray::__getstate__() const {
+    auto md = desc();
+    void *raw_data = data();
+    int ndims = md.data.ndims;
+
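+    // Pickle support: the state is a 5-tuple of (dims, data_type, format,
+    // engine pointer, raw data pointer); note that __setstate__ below is a
+    // stub, so unpickling does not yet restore the buffer.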
mkldnn::memory::dims dims; + mkldnn::memory::data_type dtype = static_cast(md.data.data_type); + mkldnn::memory::format format = static_cast(md.data.format); + static mkldnn::engine engine = get_engine(); + + PyObject *py_dims = PyTuple_New(ndims); + for (int i = 0; i < ndims; i++) { + PyObject *py_dim = PyLong_FromLong(md.data.dims[i]); + PyTuple_SetItem(py_dims, i, py_dim); + } + + PyObject *py_dtype = PyLong_FromLong((long)dtype); + PyObject *py_format = PyLong_FromLong((long)format); + PyObject *py_engine = PyLong_FromVoidPtr((void *)&engine); + PyObject *py_rdata = PyLong_FromVoidPtr((void *)raw_data); + + PyObject *state = PyTuple_New(5); + PyTuple_SetItem(state, 0, py_dims); + PyTuple_SetItem(state, 1, py_dtype); + PyTuple_SetItem(state, 2, py_format); + PyTuple_SetItem(state, 3, py_engine); + PyTuple_SetItem(state, 4, py_rdata); + + return state; +} + +// Unpickle. +void mdarray::__setstate__(PyObject *state) { + return; +} + +PyObject *mdarray::py_mdarray_from(PyObject *o) const { + PyObject *argList = Py_BuildValue("(O)", o); + + if (argList == nullptr) { + PyErr_SetString(PyExc_SystemError, "Can not create argument list"); + return nullptr; + } + + o = PyObject_CallObject(PyType_mdarray, argList); + + Py_DECREF(argList); + + if (o == nullptr) { + PyErr_SetString(PyExc_BufferError, "Cannot create mdarray from input"); + return nullptr; + } + + return o; +} + +template +void mdarray::axpby(mdarray *dst, T a, mdarray *x, T b, mdarray *y) { + ::axpby(dst->tensor(), a, x->tensor(), b, y->tensor()); +} + +template +PyObject *mdarray::axpby(T a, T b, PyObject *o) { + /// Resource manager, for GCC do not accept lambda + struct py_decref { + void operator () (PyObject *p) { + Py_DECREF(p); + } + }; + + std::unique_ptr op(nullptr); + + /// Create mdarray from buffer provider + if (reinterpret_cast(o->ob_type) == &PyArray_Type) { + o = py_mdarray_from(o); + op.reset(o); + } + + void *oprd2; + int res = SWIG_ConvertPtr(o, &oprd2, nullptr, 0); + + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_ValueError, "Wrong operand object in add wrapper"); + return nullptr; + } + + auto x = (reinterpret_cast(oprd2))->get(); + py_handle *output = new py_handle(new mdarray(x->mkldnn_memory().get_primitive_desc())); + + /// Switch position for format consistency + axpby(output->get(), b, x, a, this); + + PyObject *resultobj = SWIG_Python_NewPointerObj(nullptr + , SWIG_as_voidptr(output), SwigTy_mdarray, SWIG_POINTER_OWN | 0 ); + + return resultobj; +} + +template +PyObject *mdarray::inplace_axpby(T a, PyObject *self, T b, PyObject *o) { + // Resource manager, for GCC do not accept lambda + struct py_decref { + void operator () (PyObject *p) { + Py_DECREF(p); + } + }; + + std::unique_ptr op(nullptr); + + // Create mdarray from buffer provider + if (reinterpret_cast(o->ob_type) == &PyArray_Type) { + o = py_mdarray_from(o); + op.reset(o); + } + + void *oprd2; + int res = SWIG_ConvertPtr(o, &oprd2, nullptr, 0); + + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_ValueError, "Wrong operand object in add wrapper"); + return nullptr; + } + + auto y = (reinterpret_cast(oprd2))->get(); + axpby(this, a, this, b, y); + Py_INCREF(self); + + return self; +} + +PyObject *mdarray::m_Add(PyObject *self, PyObject *o) { + // Array Broadcast + if (!is_mdarray_supported(self, o)) { + return m_Add_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + // Make compatibility with Non-C-Contiguous array. 
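+        // On Python 2, a non-C-contiguous ndarray operand is first copied into
+        // a C-contiguous buffer via PyArray_ContiguousFromAny before the
+        // element-wise map; on Python 3 the map implementation handles it as-is.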
+ PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_Add_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return axpby(1.0f, 1.0f, o); + } +} + +PyObject *mdarray::m_Subtract(PyObject *self, PyObject *o) { + // Array Broadcast + if (!is_mdarray_supported(self, o)) { + return m_Subtract_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_Subtract_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return axpby(1.0f, -1.0f, o); + } +} + +PyObject *mdarray::m_InPlaceAdd(PyObject *self, PyObject *o) { + // Array Broadcast + if (!is_mdarray_supported(self, o)) { + return m_InPlaceAdd_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_InPlaceAdd_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return inplace_axpby(1.0f, self, 1.0f, o); + } +} + +PyObject *mdarray::m_InPlaceSubtract(PyObject *self, PyObject *o) { + // Array Broadcast + if (!is_mdarray_supported(self, o)) { + return m_InPlaceSubtract_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? 
NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_InPlaceSubtract_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return inplace_axpby(1.0f, self, -1.0f, o); + } +} + +template +void plain_mult(const T *a, const T *b, T *o, int size) { + for (int idx = 0; idx < size; idx++) + o[idx] = a[idx] * b[idx]; +} + +template +void plain_div(const T *a, const T *b, T *o, int size) { + for (int idx = 0; idx < size; idx++) + o[idx] = a[idx] / b[idx]; +} + +enum {mmult, mdiv}; +PyObject *mdarray::m_mult_div(PyObject *self, PyObject *o, int mult_or_div, bool inplace) { + struct py_decref { + void operator () (PyObject *p) { + Py_DECREF(p); + } + }; + + std::unique_ptr op(nullptr); + + enum mult_type_t { MULT_UNKNOWN, MULT_ELTWISE, MULT_SCALAR }; + + PyTypeObject *oprd2_type = reinterpret_cast(o->ob_type); + int mult_type = static_cast(MULT_UNKNOWN); + if (oprd2_type == &PyArray_Type) { + mult_type = MULT_ELTWISE; + o = py_mdarray_from(o); + op.reset(o); + } else if (PyObject_HasAttrString(o, "is_mdarray")) { + mult_type = MULT_ELTWISE; + } else if (PyFloat_Check(o) || PyInt_Check(o) || PyNumber_Check(o)) { + mult_type = MULT_SCALAR; + } + + PyObject *resultobj = nullptr; + + switch (static_cast(mult_type)) { + case MULT_ELTWISE: { + void *oprd2; + int res = SWIG_ConvertPtr(o, &oprd2, nullptr, 0); + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_ValueError, "Error oprd2 %matrix element multiply"); + break; + } + + auto oprd1_mdarr = this; + auto oprd2_mdarr = (reinterpret_cast(oprd2))->get(); + + if (oprd1_mdarr->size() != oprd2_mdarr->size()) { + PyErr_SetString(PyExc_SystemError, "Abnormal matrix size %matrix element multiply"); + break; + } + + std::vector prims; + std::unique_ptr mreorder; + + auto oprd2_internal_m = reorder_if_must(oprd2_mdarr->mkldnn_memory(), + oprd1_mdarr->mkldnn_memory().get_primitive_desc(), + mreorder, + &prims); + mkldnn::stream s(mkldnn::stream::kind::eager); + s.submit(prims).wait(); + + mkldnn::memory::desc res_desc = oprd1_mdarr->desc(); + mkldnn::memory::dims res_tz; + mkldnn::memory::data_type res_dtype = + static_cast(res_desc.data.data_type); + mkldnn::memory::format res_fmt = + static_cast(res_desc.data.format); + mkldnn::engine res_engine = oprd1_mdarr->get_engine(); + + assert(oprd1_mdarr->ndims() == 2 || oprd1_mdarr->ndims() == 4); + for (int ndim = 0; ndim < static_cast(oprd1_mdarr->ndims()); ndim++) + res_tz.push_back(res_desc.data.dims[ndim]); + + mdarray *res_mdarr; + if (!inplace) { + res_mdarr = new mdarray(res_tz, res_dtype, res_fmt, res_engine); + } else { + res_mdarr = oprd1_mdarr; + } + + assert(mkldnn::memory::f32 == res_dtype || + mkldnn::memory::s32 == res_dtype || + mkldnn::memory::s16 == res_dtype || + mkldnn::memory::s8 == res_dtype || + mkldnn::memory::u8 == res_dtype ); + assert(mmult == mult_or_div || + mdiv == mult_or_div); + if (mkldnn::memory::f32 == res_dtype) { + switch (mult_or_div) { + case mmult: + vsMul(oprd1_mdarr->size(), + reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data())); + break; + + case mdiv: + plain_div(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + } + } else if (mkldnn::memory::s32 == res_dtype) { + switch (mult_or_div) { + case mmult: + plain_mult(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + 
reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + + case mdiv: + plain_div(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + } + } else if (mkldnn::memory::s16 == res_dtype) { + switch (mult_or_div) { + case mmult: + plain_mult(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + + case mdiv: + plain_div(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + } + } else if (mkldnn::memory::s8 == res_dtype) { + switch (mult_or_div) { + case mmult: + plain_mult(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + + case mdiv: + plain_div(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + } + } else if (mkldnn::memory::u8 == res_dtype) { + switch (mult_or_div) { + case mmult: + plain_mult(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + + case mdiv: + plain_div(reinterpret_cast(oprd1_mdarr->data()), + reinterpret_cast(oprd2_internal_m.get_data_handle()), + reinterpret_cast(res_mdarr->data()), + static_cast(oprd1_mdarr->size())); + break; + } + } + + if (!inplace) { + auto res_py_handle = new py_handle(res_mdarr); + resultobj = SWIG_Python_NewPointerObj(nullptr, + SWIG_as_voidptr(res_py_handle), + SwigTy_mdarray, + SWIG_POINTER_OWN | 0); + } else { + resultobj = self; + Py_INCREF(self); + } + + break; + } + + case MULT_SCALAR: { + double a = PyInt_Check(o) ? + static_cast(PyInt_AsLong(o)) : + PyFloat_AsDouble(o), + b = 0.0; + + a = (mmult == mult_or_div) ? a : (1 / a); + + if (!inplace) { + resultobj = axpby(a, b, self); + } else { + resultobj = inplace_axpby(a, self, b, self);; + } + break; + } + + case MULT_UNKNOWN: + default: + PyErr_SetString(PyExc_SystemError, "Abnormal type % matrix * scalar"); + break; + } + + return resultobj; +} + +PyObject *mdarray::m_Multiply(PyObject *self, PyObject *o) { + if (!is_mdarray_supported(self, o)) { + return m_Multiply_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_Multiply_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return m_mult_div(self, o, mmult, false); + } +} + +PyObject *mdarray::m_InPlaceMultiply(PyObject *self, PyObject *o) { + if (!is_mdarray_supported(self, o)) { + return m_InPlaceMultiply_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? 
NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_InPlaceMultiply_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return m_mult_div(self, o, mmult, true); + } +} + +PyObject *mdarray::m_Divide(PyObject *self, PyObject *o) { + if (!is_mdarray_supported(self, o)) { + return m_Divide_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_Divide_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return m_mult_div(self, o, mdiv, false); + } +} + +PyObject *mdarray::m_InPlaceDivide(PyObject *self, PyObject *o) { + if (!is_mdarray_supported(self, o)) { + return m_InPlaceDivide_map_impl(self, o); + } else if (PyArray_Check(o) && + !PyArray_IS_C_CONTIGUOUS(reinterpret_cast(o))) { + PyObject *_o = o; +#if PY_VERSION_HEX < 0x03000000 + _o = reinterpret_cast(PyArray_ContiguousFromAny( + o, PyArray_ISFLOAT(reinterpret_cast(o)) ? NPY_FLOAT : NPY_INT, 0, 0)); +#endif + PyObject *ret = m_InPlaceDivide_map_impl(self, _o); +#if PY_VERSION_HEX < 0x03000000 + Py_DECREF(_o); +#endif + return ret; + } else { + return m_mult_div(self, o, mdiv, true); + } +} + +int mdarray::getbuffer(PyObject *self, Py_buffer *view, int flags) { + if ((flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) { + PyErr_SetString(PyExc_ValueError, "carray is not Fortran contiguous"); + return -1; + } + + if (view == nullptr) { + PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer"); + return -1; + } + + // reorderer type object + if (PyType_reorder_buffer == nullptr) { + PyErr_SetString(PyExc_NameError, "name 'reorderer' is not defined"); + return -1; + } + + // Wrote some python in C++ :) + PyObject *argList = Py_BuildValue("(O)", self); + if (argList == nullptr) { + return -1; + } + + // TODO: Do we need to cache this thing? + PyObject *rbobj = PyObject_CallObject(PyType_reorder_buffer, argList); + Py_DECREF(argList); + + if (rbobj == nullptr) { + return -1; + } + + Reorderer *rb; + int res = SWIG_ConvertPtr(rbobj, reinterpret_cast(&rb), nullptr, 0); + + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_RuntimeError, "Can't get C++ object from python object"); + return -1; + } + + if (rb->non_trivial()) + rb->fire(this->tensor()); + + if (build_view(view, flags, *rb)) { + PyErr_SetString(PyExc_RuntimeError, "Can't build Py_buffer!"); + return -1; + } + + // Stolen reference + view->obj = rbobj; + sync_reorder_ = rb; + + // reset self mdarray's tensor, keep buffer consistency. + if (rb->non_trivial()) { + mdarray *src_mdarray = get_mdarray_from_PyObject(self); + if (!src_mdarray) { + PyErr_SetString(PyExc_RuntimeError, "Can't get src mdarray from python object!"); + return -1; + } + + Tensor *src_tensor = src_mdarray->tensor(); + mkldnn::memory::dims src_dims = (mkldnn::memory::dims)src_tensor->dims(); + mkldnn_memory_format_t dst_fmt = public_format(src_tensor->format()); + + Tensor *dst_tensor = new Tensor(src_dims.size(), src_dims, rb->data_, + dst_fmt, src_tensor->type()); + src_mdarray->reset_tensor(dst_tensor); + } + return 0; +} + +PyObject *mdarray::getattro(PyObject *self, PyObject *name) { + // XXX: Recursive alarm !!! 
+    PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0
+            , NPY_ARRAY_ELEMENTSTRIDES, nullptr);
+
+    if (surrogate == nullptr)
+        return nullptr;
+
+    // Watch the reference count of surrogate if a more complicated
+    // lookup method ever gets involved
+    PyObject *attr = PyObject_GetAttr(surrogate, name);
+
+    // The surrogate will be destroyed after the attribute lookup is done
+    Py_DECREF(surrogate);
+
+    if (attr == nullptr && PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        PyErr_Clear();
+
+        // Switch to our exception message if things went wrong
+        PyTypeObject *tp = Py_TYPE(self);
+        PyErr_Format(PyExc_AttributeError
+                , "mdarray '%.50s' object has no attribute '%p'", tp->tp_name, name);
+    }
+
+    return attr;
+}
+
+Py_ssize_t mdarray::mp_length(PyObject *self) {
+    PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0
+            , NPY_ARRAY_ELEMENTSTRIDES, nullptr);
+
+    if (surrogate == nullptr)
+        return -1;
+
+    Py_ssize_t len = PyMapping_Length(surrogate);
+    Py_DECREF(surrogate);
+
+    // TODO: Exception localize
+    return len;
+}
+
+PyObject *mdarray::mp_subscript(PyObject *self, PyObject *op) {
+    PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0
+            , NPY_ARRAY_ELEMENTSTRIDES, nullptr);
+
+    if (surrogate == nullptr)
+        return nullptr;
+
+    PyObject *ret = PyObject_GetItem(surrogate, op);
+    Py_DECREF(surrogate);
+
+    // TODO: Exception localize
+    return ret;
+}
+
+int mdarray::mp_ass_subscript(PyObject *self, PyObject *ind, PyObject *op) {
+    PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0
+            , NPY_ARRAY_ELEMENTSTRIDES, nullptr);
+
+    int ret;
+
+    if (surrogate == nullptr)
+        return -1;
+
+    if (op == nullptr)
+        ret = PyObject_DelItem(surrogate, ind);
+    else
+        ret = PyObject_SetItem(surrogate, ind, op);
+
+    if (sync_reorder_ && sync_reorder_->non_trivial()) {
+        sync_reorder_->sync(this->tensor());
+    }
+
+    Py_DECREF(surrogate);
+
+    // TODO: Exception localize
+    return ret;
+}
+
+PyObject *mdarray::flat() {
+    long int dims[1] = {static_cast<long int>(this->size())};
+
+    int typenum = NPY_NOTYPE;
+    switch (static_cast<mkldnn::memory::data_type>(
+                this->mkldnn_memory().get_primitive_desc().desc().data.data_type)) {
+        case mkldnn::memory::f32:
+            typenum = NPY_FLOAT32;
+            break;
+        case mkldnn::memory::s32:
+            typenum = NPY_INT;
+            break;
+        case mkldnn::memory::s16:
+            typenum = NPY_INT16;
+            break;
+        case mkldnn::memory::s8:
+            typenum = NPY_INT8;
+            break;
+        case mkldnn::memory::u8:
+            typenum = NPY_UINT8;
+            break;
+        default:
+            PyErr_SetString(PyExc_ValueError, "Bad mdarray data_type");
+            break;
+    }
+
+    PyObject *plain_arr = nullptr;
+    plain_arr = PyArray_SimpleNewFromData(1, dims, typenum, this->data());
+    if (!plain_arr)
+        PyErr_SetString(PyExc_ValueError, "Can't create a plain array from mdarray");
+
+    return plain_arr;
+}
+
+PyObject *mdarray::reshape(py_handle *self, vector<int> dims)
+{
+    if (dims.size() != 4 && dims.size() != 2) {
+        PyErr_SetString(PyExc_ValueError,"Only support reshape to 2 or 4 dimensions");
+        return nullptr;
+    }
+    int idx_unknown = -1;
+    size_t size = 1;
+    for (unsigned int i = 0; i < dims.size(); i++) {
+        if (dims[i] < 0) {
+            if (idx_unknown == -1) {
+                idx_unknown = i;
+            } else {
+                PyErr_SetString(PyExc_ValueError,"Only support 1 unknown dimension");
+                return nullptr;
+            }
+        } else {
+            size *= dims[i];
+        }
+    }
+    if (idx_unknown == -1) {
+        if (size != this->size()) {
+            PyErr_SetString(PyExc_ValueError,"Wrong dimension to reshape");
+            return nullptr;
+        }
+    } else if (this->size() % size) {
+        PyErr_SetString(PyExc_ValueError,"Wrong dimension to reshape");
+        return nullptr;
+    } else {
+        dims[idx_unknown] = this->size() / size;
+    }
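+    // A single negative entry is inferred from the total element count; e.g.
+    // a 24-element mdarray reshaped with dims (2, -1) comes out as (2, 12).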
+    Tensor *tensor = tensor_->reshape(dims);
+    if (tensor == nullptr) {
+        PyErr_SetString(PyExc_ValueError,"The dimension is not valid in reshape");
+        return nullptr;
+    } else {
+        //mdarray *new_array = new ::mdarray(tensor);
+        py_handle *output = new py_handle(new mdarray(tensor));
+        PyObject *resultobj = SWIG_Python_NewPointerObj(nullptr
+                , SWIG_as_voidptr(output), SwigTy_mdarray, SWIG_POINTER_OWN | 0 );
+        return resultobj;
+    }
+}
+
+PyObject *mdarray::sum(vector<int> axis, bool keepdims)
+{
+    auto tensor = tensor_->sum(axis);
+    if (tensor) {
+        if (keepdims) {
+            vector<int> expected_shape;
+            for (int v = 0; v < this->ndims(); v++)
+                expected_shape.push_back(this->desc().data.dims[v]);
+
+            for (unsigned a = 0; a < axis.size(); a++)
+                expected_shape[axis[a]] = 1;
+
+            auto _tensor = tensor->reshape(expected_shape);
+            delete tensor;
+            tensor = _tensor;
+        }
+
+        auto output = new py_handle(new mdarray(tensor));
+        auto resultobj = SWIG_Python_NewPointerObj(nullptr,
+                SWIG_as_voidptr(output), SwigTy_mdarray,
+                SWIG_POINTER_OWN | 0);
+        return resultobj;
+    } else {
+        return nullptr;
+    }
+}
+
+bool mdarray::is_mdarray(PyObject *o)
+{
+    return (reinterpret_cast<PyObject *>(o->ob_type)
+            == PyType_mdarray);
+}
+
+}
diff --git a/python/ideep4py/py/mm/mdarray.h b/python/ideep4py/py/mm/mdarray.h
new file mode 100755
index 00000000..c943942c
--- /dev/null
+++ b/python/ideep4py/py/mm/mdarray.h
@@ -0,0 +1,547 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ *THE SOFTWARE.
+ * + */ + + +#ifndef _MDARRAY_H_ +#define _MDARRAY_H_ +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mem.h" +#include "tensor.h" +#include "reorder.h" + +// FIXME +// use global engine to init mdarray +using namespace mkldnn; +extern engine cpu_engine; + +namespace implementation { + class mdarray; +} + +using py_handle = std::shared_ptr; + +namespace implementation { + +#if PY_VERSION_HEX >= 0x03000000 + int g_init(); +#else + void g_init(); +#endif + +#define NPY_ARRAY_SURROGATE_ENTRY(mdarray) \ + PyObject *surrogate = PyArray_FromAny(mdarray, nullptr, 0, 0 \ + , NPY_ARRAY_ELEMENTSTRIDES, nullptr) \ + +#define NPY_ARRAY_SURROGATE_EXIT() + +#define nb_unary_map_impl(method) \ + PyObject * m_ ## method ## _map_impl(PyObject *self) { \ + NPY_ARRAY_SURROGATE_ENTRY(self); \ + \ + if (surrogate == nullptr) \ + return nullptr; \ + \ + PyObject *res = PyNumber_ ## method(surrogate); \ + Py_DECREF(surrogate); \ + NPY_ARRAY_SURROGATE_EXIT(); \ + return res; \ + } \ + +#define nb_unary_map(method) \ + nb_unary_map_impl(method) \ + PyObject * m_ ## method (PyObject *self) { \ + return m_ ## method ## _map_impl(self); \ + } \ + +#define nb_binary_map_impl(method) \ + PyObject * m_ ## method ## _map_impl(PyObject *self, PyObject *o) { \ + PyObject *left = self, *right = o; \ + if (is_mdarray(left)) { \ + left = PyArray_FromAny(left, nullptr, 0, 0 \ + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); \ + } \ + if (is_mdarray(right)) { \ + right = PyArray_FromAny(right, nullptr, 0, 0 \ + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); \ + } \ + PyObject *res = PyNumber_ ## method(left, right); \ + if (left != self) \ + Py_DECREF(left); \ + if (right != o) \ + Py_DECREF(right); \ + return res; \ + } + +#define nb_binary_map_impl_with_target_func(method, tfunc) \ + PyObject * m_ ## method ## _map_impl(PyObject *self, PyObject *o) { \ + NPY_ARRAY_SURROGATE_ENTRY(self); \ + \ + if (surrogate == nullptr) \ + return nullptr; \ + \ + PyObject *res = PyNumber_ ## tfunc(surrogate, o); \ + Py_DECREF(surrogate); \ + NPY_ARRAY_SURROGATE_EXIT(); \ + return res; \ + } + +#define nb_binary_map(method) \ + nb_binary_map_impl(method) \ + PyObject * m_ ## method (PyObject *self, PyObject *o) { \ + return m_ ## method ## _map_impl(self, o); \ + } \ + +#define nb_ternary_map_impl(method) \ + PyObject * m_ ## method ## _map_impl(PyObject *self, PyObject *o1, PyObject *o2) { \ + NPY_ARRAY_SURROGATE_ENTRY(self); \ + \ + if (surrogate == nullptr) \ + return nullptr; \ + \ + PyObject *res = PyNumber_ ## method(surrogate, o1, o2); \ + Py_DECREF(surrogate); \ + NPY_ARRAY_SURROGATE_EXIT(); \ + return res; \ + } + +#define nb_ternary_map(method) \ + nb_ternary_map_impl(method) \ + PyObject * m_ ## method (PyObject *self, PyObject *o1, PyObject *o2) { \ + return m_ ## method ## _map_impl(self, o1, o2); \ + } \ + + +//class mdarray : public Tensor { +class mdarray { +public: + // It is exposed to python + // + static constexpr int MAX_NDIM = 12; //XXX: For now + + class Reorder_buffer : Reorderer { + public: + Reorder_buffer(const py_handle in) + :Reorderer(in.get()->tensor()) {} + }; + +public: + typedef size_t size_type; + // Generated on demand + //FIXME + //yli135: add default constructor so that we can pass vector form native + mdarray(); + virtual ~mdarray() = default; + + mdarray(Tensor *tensor) : tensor_(tensor) {} + + mdarray(mkldnn::memory::dims &dims + , mkldnn::memory::data_type dt + , 
mkldnn::memory::format format + , const mkldnn::engine &engine) + : tensor_(new Tensor(dims, dt, format, engine)) {} + + mdarray(mkldnn::memory::primitive_desc pd) + : tensor_(new Tensor(pd)) {} + +#if 0 + mdarray(int ndims, vector dims, void *data, + mkldnn_memory_format_t mm_fmt, data_type_t type=FLOAT32) + : tensor_(new Tensor(ndims, dims, data, mm_fmt, type)) {} +#endif + + mdarray(Py_buffer *view, char input_type='d') {// input_type : 'd'-->data, 'w'-->weight + data_type_t dt; + std::string format(view->format); + if (std::string::npos != format.find_last_of('f')) { + dt = FLOAT32; + } else if (std::string::npos != format.find_last_of('i')) { + dt = SINT32; + } else if (std::string::npos != format.find_last_of('h')) { + dt = SINT16; + } else if (std::string::npos != format.find_last_of('b')) { + dt = SINT8; + } else if (std::string::npos != format.find_last_of('B')) { + dt = UINT8; + } else { + throw mkldnn::error(mkldnn_invalid_arguments + , std::string("MKLDNN does not support data type: ") + + format); + } + vector dims(view->shape, view->shape + view->ndim); + //std::unique_ptr tensor(new Tensor(view->ndim, dims, view->buf, dt)); + tensor_.reset(new Tensor(view->ndim, dims, view->buf, dt, input_type)); + + PyBuffer_Release(view); + +#if 0 + ndims_ = view->ndim; + dims_.assign(view->shape, view->shape + view->ndim); + size_ = view->len / view->itemsize; + type_ = dt; + data_ = std::shared_ptr(new avx::byte [view->len] + , [] (avx::byte *p) {delete [] p;}); + memcpy(data_.get(), view->buf, view->len); + mm_fmt_ = ndims2format(ndims_); + memory::data_type type = to_mkldnn_type(); + mem_.reset(new mkldnn::memory( + { { { dims_ }, type, static_cast(mm_fmt_) } + , cpu_engine }, data_.get())); +#endif + } + + static bool is_mdarray(PyObject *o); + + //FIXME + inline void unpickled_data(void *pdata) { + //data_.reset(reinterpret_cast(pdata)); + //m_.set_data_handle(pdata); + return; + } + + // PEP 3118 interface + int build_view(Py_buffer *view, int flags, const Reorderer &reorder) { + view->buf = reorder.data_.get(); + view->itemsize = reorder.itemsize_; + view->readonly = 0; + view->internal = nullptr; + view->len = reorder.size_ * reorder.itemsize_; + + if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) { + view->format = const_cast(reorder.format_); + } else { + view->format = nullptr; + } + + if ((flags & PyBUF_ND) == PyBUF_ND) { + view->ndim = reorder.ndims_; + view->shape = const_cast(reorder.shape_); + } else { + view->ndim = 0; + view->shape = nullptr; + } + + if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) { + view->strides = const_cast(reorder.strides_); + } else { + view->strides = nullptr; + } + + view->suboffsets = nullptr; + + return 0; + } + +#if 0 + // Array protocol + PyArrayInterface *build_array_struct(void) { + auto arrstr = new PyArrayInterface(); + + arrstr->two = 2; + arrstr->nd = ndims_; + arrstr->typekind = *((char *)format_); + arrstr->itemsize = itemsize_; + arrstr->flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_NOTSWAPPED | + NPY_ARRAY_ALIGNED | NPY_ARRAY_WRITEABLE; + arrstr->flags &= ~(NPY_ARRAY_UPDATEIFCOPY | NPY_ARRAY_OWNDATA); + arrstr->shape = shape_; + arrstr->strides = strides_; + arrstr->data = data_.get(); + arrstr->descr = nullptr; + + return arrstr; + } +#endif + + PyObject *__getstate__(void) const; + + void __setstate__(PyObject *state); + + PyObject *py_mdarray_from(PyObject *o) const; + + /// d = a * x + b * y, using x's format + template + static void axpby(mdarray *dst, T a, mdarray *x, T b, mdarray *y); + + /// Interface to directly contact python + 
template + PyObject *axpby(T a, T b, PyObject *o); + + template + PyObject *inplace_axpby(T a, PyObject *self, T b, PyObject *o); + + PyObject *flat(void); + + PyObject *reshape(py_handle *self, vector dims); + + PyObject *m_mult_div(PyObject *self, PyObject *o, int mult_or_div, bool inplace); + + PyObject *sum(std::vector axis, bool keepdims); + + // PEP: 3118 Buffer Protocol Producer + virtual int getbuffer(PyObject *obj, Py_buffer *view, int flags); + + PyObject *getattro(PyObject *self, PyObject *name); + + PyObject *m_Add(PyObject *self, PyObject *o); + nb_binary_map_impl(Add); + PyObject *m_InPlaceAdd(PyObject *self, PyObject *o); + nb_binary_map_impl(InPlaceAdd); + PyObject *m_Subtract(PyObject *self, PyObject *o); + nb_binary_map_impl(Subtract); + PyObject *m_InPlaceSubtract(PyObject *self, PyObject *o); + nb_binary_map_impl(InPlaceSubtract); + PyObject *m_Multiply(PyObject *self, PyObject *o); + nb_binary_map_impl(Multiply); + PyObject *m_InPlaceMultiply(PyObject *self, PyObject *o); + nb_binary_map_impl(InPlaceMultiply); + // SWIG: nb_true_divide (no slot) <= nb_divide + PyObject *m_Divide(PyObject *self, PyObject *o); +#if PY_VERSION_HEX < 0x03000000 + nb_binary_map_impl(Divide); +#else + nb_binary_map_impl_with_target_func(Divide, TrueDivide); +#endif + PyObject *m_InPlaceDivide(PyObject *self, PyObject *o); +#if PY_VERSION_HEX < 0x03000000 + nb_binary_map_impl(InPlaceDivide); +#else + nb_binary_map_impl_with_target_func(InPlaceDivide, InPlaceTrueDivide); +#endif + + nb_binary_map(Remainder); + nb_binary_map(Divmod); + nb_unary_map(Negative); + nb_unary_map(Positive); + nb_unary_map(Absolute); + nb_unary_map(Invert); + nb_binary_map(Lshift); + nb_binary_map(Rshift); + nb_binary_map(And); + nb_binary_map(Xor); + nb_binary_map(Or); + nb_binary_map(InPlaceRemainder); + nb_ternary_map(InPlacePower); + nb_binary_map(InPlaceLshift); + nb_binary_map(InPlaceRshift); + nb_binary_map(InPlaceAnd); + nb_binary_map(InPlaceXor); + nb_binary_map(InPlaceOr); + nb_binary_map(FloorDivide); + nb_binary_map(InPlaceFloorDivide); +#if (PY_VERSION_HEX >= 0x03000000) + nb_binary_map(MatrixMultiply); + nb_binary_map(InPlaceMatrixMultiply); +#endif + + Py_ssize_t mp_length(PyObject *self); + PyObject *mp_subscript(PyObject *self, PyObject *op); + int mp_ass_subscript(PyObject *self, PyObject *ind, PyObject *op); + + inline Tensor* tensor() { + return tensor_.get(); + } + inline Tensor &tensor2() { + return *(tensor_.get()); + } + inline int ndims() const { + return tensor_->ndims(); + } + inline memory::desc desc() const { + return tensor_->desc(); + } + inline size_type size() const { + return tensor_->size(); + } + inline void *data() const { + return tensor_->data(); + } + inline mkldnn::engine get_engine() const { + return tensor_->get_engine(); + } + inline mkldnn::memory mkldnn_memory() const { + return tensor_->mkldnn_memory(); + } + inline void reset_tensor(Tensor *dst) { + tensor_.reset(dst); + } +private: + struct WeDontManageIt { + void operator() (const Py_buffer *view) { + PyBuffer_Release(const_cast(view)); + delete view; + } + }; + + std::unique_ptr view_; + +protected: + std::unique_ptr tensor_; + Reorderer *sync_reorder_; + +#if 0 +private: + static mkldnn::memory::desc _d_from_view(const Py_buffer *view + , mkldnn::memory::format order) { + mkldnn::memory::dims dims (view->ndim); + + for( int i=0; i < view->ndim; i++) + dims[i] = view->shape[i]; + + std::string format(view->format); + mkldnn::memory::data_type dt; + + if (view->itemsize == 4) { + if (std::string::npos != 
format.find_last_of('f')) {
+                dt = mkldnn::memory::f32;
+            } else if (std::string::npos != format.find_last_of('i')) {
+                dt = mkldnn::memory::s32;
+            } else
+                throw mkldnn::error(mkldnn_invalid_arguments
+                        , std::string("MKLDNN does not support data type: ")
+                        + format);
+        } else
+            throw mkldnn::error(mkldnn_invalid_arguments
+                    , "MKLDNN does not support itemsize other than 4");
+
+        return mkldnn::memory::desc(dims, dt, order);
+    }
+#endif
+};
+
+}
+
+//
+// Actual interface for python
+// DO NOT add field
+//
+class mdarray : public py_handle {
+public:
+    //FIXME
+    //yli135: add a default constructor so that we can pass vectors from native code
+    mdarray() {};
+
+    mdarray(Tensor *tensor)
+        : py_handle(std::make_shared<implementation::mdarray>(tensor)) {}
+
+    mdarray(mkldnn::memory::dims &dims
+            , mkldnn::memory::data_type dt
+            , mkldnn::memory::format format
+            , mkldnn::engine &engine)
+        : py_handle(std::make_shared<implementation::mdarray>
+                (dims, dt, format, engine)) {}
+
+    mdarray(mkldnn::memory::primitive_desc pd)
+        : py_handle(std::make_shared<implementation::mdarray>(pd)) {}
+
+    mdarray(Py_buffer *view, char input_type='d')
+        : py_handle(std::make_shared<implementation::mdarray>(view, input_type)) {}
+
+#if 0
+    mdarray(int ndims, vector<int> dims, void *data,
+            mkldnn_memory_format_t mm_fmt, data_type_t type=FLOAT32)
+        : py_handle(std::make_shared<implementation::mdarray>(ndims, dims, data, mm_fmt, type)) {}
+#endif
+
+    static PyObject *mdarray_shape_get(mdarray *arg) {
+        implementation::mdarray *self = arg->get();
+        int ndim = self->ndims();
+        PyObject *intTuple = PyTuple_New(ndim);
+        auto data = self->desc().data;
+
+        if (!intTuple)
+            goto fail;
+
+        for (int i = 0; i < ndim; i++) {
+            PyObject *o = PyLong_FromLong(data.dims[i]);
+
+            if (!o) {
+                Py_DECREF(intTuple);
+                intTuple = NULL;
+                goto fail;
+            }
+
+            PyTuple_SET_ITEM(intTuple, i, o);
+        }
+
+    fail:
+        return intTuple;
+    }
+
+    static PyObject *mdarray_dtype_get(mdarray *arg) {
+        implementation::mdarray *m = arg->get();
+        PyArray_Descr *pd;
+        // Translate our data_type to a numpy one
+        switch (static_cast<mkldnn::memory::data_type>(m->desc().data.data_type)) {
+            case mkldnn::memory::f32:
+                pd = PyArray_DescrFromType(NPY_FLOAT);
+                break;
+            case mkldnn::memory::s32:
+                pd = PyArray_DescrFromType(NPY_INT);
+                break;
+            case mkldnn::memory::s16:
+                pd = PyArray_DescrFromType(NPY_INT16);
+                break;
+            case mkldnn::memory::s8:
+                pd = PyArray_DescrFromType(NPY_INT8);
+                break;
+            case mkldnn::memory::u8:
+                pd = PyArray_DescrFromType(NPY_UINT8);
+                break;
+            default:
+                PyErr_SetString(PyExc_ValueError, "Bad mdarray data_type");
+                return nullptr;
+        }
+
+        return reinterpret_cast<PyObject *>(pd);
+    }
+
+    static long mdarray_size_get(mdarray *self) {
+        return self->get()->size();
+    }
+
+    static long mdarray_ndim_get(mdarray *self) {
+        return self->get()->desc().data.ndims;
+    }
+
+    static bool mdarray_is_mdarray_get(mdarray *self) {
+        return true;
+    }
+};
+
+using reorder_buffer = implementation::mdarray::Reorder_buffer;
+
+#endif // _MDARRAY_H_
diff --git a/python/ideep4py/py/mm/mdarray.i b/python/ideep4py/py/mm/mdarray.i
new file mode 100644
index 00000000..890ca5ed
--- /dev/null
+++ b/python/ideep4py/py/mm/mdarray.i
@@ -0,0 +1,351 @@
+/*
+ *Copyright (c) 2018 Intel Corporation.
+ *
+ *Permission is hereby granted, free of charge, to any person obtaining a copy
+ *of this software and associated documentation files (the "Software"), to deal
+ *in the Software without restriction, including without limitation the rights
+ *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *copies of the Software, and to permit persons to whom the Software is
+ *furnished to do so, subject to the following conditions:
+ *
+ *The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include + #include + #include + #include + #include + #include + #include +#define SWIG_INLINE + #include "mdarray.h" +%} + +%include exception.i +%include pep_3118.i +%include getattro.i +%include asnumber.i +%include asmap.i +%include attribute.i +%include tp.i +%include std_vector.i + +%template(mdarrayVector) std::vector; +%template(intVector) std::vector; + +%tp_protocol(mdarray) +%buffer_protocol_producer(mdarray) +%buffer_protocol_typemap(Py_buffer *view) +%getattr_wrapper(mdarray) +%number_protocol(mdarray) +%map_protocol(mdarray) + +%define %codegen(Class, ret_type, attrib, getter) +%{ + inline ret_type %mangle(Class) ##_## attrib ## _get(Class *self_) { + return (ret_type) Class::getter(self_); + } +%} +%enddef + +%define %extend_ro_attr(Class, ret_type, attrib, getter) + %immutable Class::attrib; + %extend Class { + ret_type attrib; + } + %codegen(Class, ret_type, attrib, getter) +%enddef + +%define %extend_ro_attr_and_own(Class, ret_type, attrib, getter) + %immutable Class::attrib; + %newobject Class::attrib; + + %extend Class { + ret_type attrib; + } + + %codegen(Class, ret_type *, attrib, getter) +%enddef + +%extend_ro_attr(mdarray, PyObject *, dtype, mdarray_dtype_get) +%extend_ro_attr(mdarray, PyObject *, shape, mdarray_shape_get) +%extend_ro_attr(mdarray, long, size, mdarray_size_get) +%extend_ro_attr(mdarray, long, ndim, mdarray_ndim_get) +%extend_ro_attr(mdarray, bool, is_mdarray, mdarray_is_mdarray_get) + +%extend mdarray { + PyObject *axpby(double a, double b, PyObject *y) { + return (*$self)->axpby(a, b, y); + } + + PyObject *inplace_axpby(double a, double b, PyObject *y) { + /// Second param y is a harmless dummy + return (*$self)->inplace_axpby(a, y, b, y); + } + + PyObject *flat() { + return (*self)->flat(); + } +} + +/* mdarray::reshape */ +%extend mdarray { + %typemap(in) (...)(vector args) { + int i; + int argc; + argc = PySequence_Size(varargs); + if (argc > 4) { + PyErr_SetString(PyExc_ValueError,"Too many arguments"); + return NULL; + } + if (argc == 1) { + Py_ssize_t size = 0; + PyObject *o = PySequence_GetItem(varargs,0); + if (PyNumber_Check(o)) { + goto numpy_surrogate; + } else if (!PySequence_Check(o)) { + PyErr_SetString(PyExc_ValueError,"Expected a sequence"); + return NULL; + } + size = PySequence_Size(o); + if (size != 4 && size != 2) { + numpy_surrogate: + PyObject *surrogate = PyArray_FromAny($self, nullptr, 0, 0 + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); + + if (surrogate == nullptr) { + PyErr_SetString(PyExc_ValueError,"Unexpected array"); + return nullptr; + } + PyObject *res = PyArray_Reshape((PyArrayObject *)surrogate, o); + + Py_DECREF(surrogate); + return res; + } + for (i = 0; i < PySequence_Size(o); i++) { + PyObject *obj = PySequence_GetItem(o, i); + if (!PyInt_Check(obj) && !PyLong_Check(obj)) { + PyErr_SetString(PyExc_ValueError,"Expected a int or long in sequence"); + return NULL; + } + args.push_back(PyInt_AsLong(obj)); + } + } else { + Py_ssize_t size = argc; + if (size != 4 && size != 2) { + PyObject 
*surrogate = PyArray_FromAny($self, nullptr, 0, 0 + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); + + if (surrogate == nullptr) { + PyErr_SetString(PyExc_ValueError,"Unexpected array"); + return nullptr; + } + PyObject *res = PyArray_Reshape((PyArrayObject *)surrogate, varargs); + + Py_DECREF(surrogate); + return res; + } + for (i = 0; i < argc; i++) { + PyObject *o = PySequence_GetItem(varargs,i); + if (!PyInt_Check(o) && !PyLong_Check(o)) { + PyErr_SetString(PyExc_ValueError,"Expected a int"); + return NULL; + } + //args[i] = PyInt_AsLong(o); + args.push_back(PyInt_AsLong(o)); + } + } + $1 = &args; + } + + PyObject *reshape(...) { + va_list vl; + va_start(vl, self); + vector *dims = va_arg(vl, vector*); + va_end(vl); + return (*self)->reshape(self, *dims); + } +} + +/* mdarray::sum */ +%extend mdarray { + %feature ("kwargs") sum; + %typemap(in) vector axis { + $1.clear(); + if (PyTuple_Check(obj1)) { + for (int i = 0; i < PyTuple_Size(obj1); i++) { + PyObject *item = PyTuple_GetItem(obj1, i); +#if PY_VERSION_HEX > 0x03000000 + if (!PyLong_Check(item)) { +#else + if (!PyInt_Check(item)) { +#endif + SWIG_exception_fail(SWIG_ValueError, + "in method '" "mdarray_sum" "', argument " "2"" of type '" "tuple (int, int, ...)""'"); + SWIG_fail; + } + + $1.push_back(PyLong_AsLong(item)); + } +#if PY_VERSION_HEX > 0x03000000 + } else if (PyLong_Check(obj1)) { +#else + } else if (PyInt_Check(obj1)) { +#endif + $1.push_back(PyLong_AsLong(obj1)); + } else { + void *_obj1; + if (!SWIG_IsOK(SWIG_ConvertPtr(obj1, &_obj1, nullptr, 0))) { + PyErr_SetString(PyExc_ValueError, "Wrong object in sum wrapper"); + SWIG_fail; + } + + if (!_obj1) { + $1.clear(); + } else { + SWIG_exception_fail(SWIG_ValueError, + "in method '" "mdarray_sum" "', argument " "2"" of type '" "tuple or int""'"); + SWIG_fail; + } + } + } + + %typemap(argout) (vector axis) { + if (!$result) { + auto *surrogate = reinterpret_cast(PyArray_FromAny( + $self, nullptr, 0, 0, NPY_ARRAY_ELEMENTSTRIDES, nullptr)); + if (surrogate == nullptr) + return nullptr; + + if (!$1.size()) { + for (int i = 0; i < PyArray_NDIM(surrogate); i++) + $1.push_back(i); + } + + vector expected_shape; + long *shape = PyArray_DIMS(surrogate); + if (arg5) { + for (int v = 0; v < PyArray_NDIM(surrogate); v++) + expected_shape.push_back(shape[v]); + + for (unsigned a = 0; a < $1.size(); a++) + expected_shape[$1[a]] = 1; + } + + auto *res = surrogate; + for (auto i = 0; i < static_cast($1.size()); i++) { + auto *tmp = reinterpret_cast(PyArray_Sum( + res, $1[i], PyArray_TYPE(res), nullptr)); + for (unsigned j = i + 1; j < $1.size(); j++) { + if ($1[i] < $1[j]) + $1[j] -= 1; + } + + // if (i < axis.size() - 1) + // Py_DECREF(res); + + Py_DECREF(res); + res = tmp; + } + + if (arg5) { + PyObject *new_shape = PyTuple_New(expected_shape.size()); + for (unsigned v = 0; v < expected_shape.size(); v++) +#if PY_VERSION_HEX > 0x03000000 + PyTuple_SetItem(new_shape, v, PyLong_FromLong(expected_shape[v])); +#else + PyTuple_SetItem(new_shape, v, PyInt_FromLong(expected_shape[v])); +#endif + res = reinterpret_cast(PyArray_Reshape(res, new_shape)); + } + return reinterpret_cast(res); + } + } + + PyObject *sum(vector axis={0}, int dtype=0, + PyObject *out=nullptr, bool keepdims=false) { + return (*self)->sum(axis, keepdims); + } +} + +/* +%extend mdarray { + PyObject *__getstate__() { + return (*$self)->__getstate__(); + } + + //TODO + %typemap(default) (PyObject *state) { + PyObject *state; + + if (!PyArg_UnpackTuple(args, (char *)"mdarray___setstate__", 0, 1, &state)) SWIG_fail; + + if 
(!PyTuple_Check(state)) SWIG_fail; + + PyObject *py_dims = PyTuple_GetItem(state, 0); + PyObject *py_dtype = PyTuple_GetItem(state, 1); + PyObject *py_format = PyTuple_GetItem(state, 2); + PyObject *py_engine = PyTuple_GetItem(state, 3); + PyObject *py_rdata = PyTuple_GetItem(state, 4); + + void *rdata = PyLong_AsVoidPtr(py_rdata); + + mdarray *unpickled_mdarr = nullptr; //new mdarray(dims, dtype, format, engine); + (*unpickled_mdarr)->unpickled_data(rdata); + SwigPyObject *sobj = SWIG_Python_GetSwigThis(self); + if (sobj) { + sobj->ptr = reinterpret_cast(unpickled_mdarr); + sobj->ty = SWIGTYPE_p_mdarray; + sobj->own = 0; + sobj->next = 0; + } else { + SWIG_fail; + } + } + + void __setstate__(PyObject *state) { + (*$self)->__setstate__(state); + } +} +*/ + +class mdarray: public py_handle { +public: + // It is deliberately NOT matching prototypes! + // FIXME + // add default constructor so that native can pass vector to python + mdarray(); + mdarray(Py_buffer *view, char input_type = 'd'); + virtual ~mdarray(); +}; + +%typemap(in) (mdarray *in_mdarray) { + void *that; + int res1 = SWIG_ConvertPtr($input, &that, nullptr, 0); + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Can't convert mdarray pyobject"); + return nullptr; + } + $1 = (reinterpret_cast(that)); +}; + +class reorder_buffer { +public: + reorder_buffer(mdarray in); +}; + +%include "basic.i" diff --git a/python/ideep4py/py/primitives/bn.i b/python/ideep4py/py/primitives/bn.i new file mode 100644 index 00000000..855b7f1b --- /dev/null +++ b/python/ideep4py/py/primitives/bn.i @@ -0,0 +1,35 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "bn_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "std_vector.i" +%include "bn_py.h" + +%template(batchNormalization) batch_normalization_py; diff --git a/python/ideep4py/py/primitives/bn_py.h b/python/ideep4py/py/primitives/bn_py.h new file mode 100644 index 00000000..ed9e234f --- /dev/null +++ b/python/ideep4py/py/primitives/bn_py.h @@ -0,0 +1,72 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _BN_PY_H_ +#define _BN_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "bn.h" + +template +class batch_normalization_py { +public: + static std::vector Forward(mdarray *src, + mdarray *w, mdarray *mean, mdarray *var, float eps) { + + std::vector outs; + auto tensors = batch_normalization::Forward( + (src->get()->tensor()), + (w ? w->get()->tensor() : nullptr), + (mean ? mean->get()->tensor() : nullptr), + (var ? var->get()->tensor() : nullptr), eps); + + for (int i = 0; i < tensors.size(); i++) + outs.push_back(mdarray(tensors[i])); + + return outs; + } + + static std::vector Backward(mdarray *src, mdarray *diff_dst, + mdarray *mean, mdarray *var, mdarray *w, float eps) { + + std::vector outs; + auto tensors = batch_normalization::Backward( + (src->get()->tensor()), + (diff_dst->get()->tensor()), + (mean->get()->tensor()), + (var->get()->tensor()), + (w ? w->get()->tensor() : nullptr), + eps); + + for (int i = 0; i < tensors.size(); i++) + outs.push_back(mdarray(tensors[i])); + + return outs; + } +}; + +#endif // _BN_PY_H_ diff --git a/python/ideep4py/py/primitives/concat.i b/python/ideep4py/py/primitives/concat.i new file mode 100644 index 00000000..7fbaefa9 --- /dev/null +++ b/python/ideep4py/py/primitives/concat.i @@ -0,0 +1,44 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. 
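For orientation, here is a rough Python driver for the batch normalization wrapper above; it is a sketch only. The `batchNormalization` name comes from the `%template` line in `bn.i`, but the gamma/beta packing of the weight array and the ordering of the returned vector are assumptions not confirmed by this diff:

    import numpy
    import ideep4py

    x = ideep4py.mdarray(
        numpy.random.rand(2, 16, 4, 4).astype(numpy.float32))
    w = ideep4py.mdarray(numpy.ones((2, 16), numpy.float32))  # assumed gamma/beta packing
    mean = ideep4py.mdarray(numpy.zeros(16, numpy.float32))
    var = ideep4py.mdarray(numpy.ones(16, numpy.float32))

    # Forward returns a vector of mdarrays; which slot holds y versus the
    # saved statistics is an assumption here.
    outs = ideep4py.batchNormalization.Forward(x, w, mean, var, 2e-5)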
+ * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "concat_py.h" +%} + +%include "std_vector.i" +%include "concat_py.h" + +%template(concat) Concat_Py; + +// +// Python API for Concat +// +// mdarray Concat_Py::Forward( +// std::vector src, +// int axis); +// std::vector Concat_Py::Backward( +// mdarray *diff_dst, +// std::vector offsets, +// int axis); diff --git a/python/ideep4py/py/primitives/concat_py.h b/python/ideep4py/py/primitives/concat_py.h new file mode 100644 index 00000000..c528e907 --- /dev/null +++ b/python/ideep4py/py/primitives/concat_py.h @@ -0,0 +1,82 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _CONCAT_PY_H_ +#define _CONCAT_PY_H_ + +#include +#include +#include "mdarray.h" +#include "concat.h" + +template +class Concat_Py +{ +public: + /* + * Python Concat Forward + * params: + * src: input, xs + * axis + */ + static mdarray Forward(std::vector src, int axis) { + std::vector src_tensor; + + for (int i = 0; i < src.size(); i++) { + src_tensor.push_back(src[i].get()->tensor()); + } + + Tensor *dst_tensor = Concat::Forward(src_tensor, axis); + + mdarray dst_mdarray = mdarray(dst_tensor); + return dst_mdarray; + } + + /* + * Python Concat Backward + */ + static std::vector Backward(mdarray *diff_dst, + std::vector offsets, + int axis) { + std::vector gxs; + + std::vector gxs_tensor = Concat::Backward( + (diff_dst->get()->tensor()), + offsets, + axis); + + // + for (int i = 0; i < gxs_tensor.size(); i++){ + gxs.push_back(mdarray(gxs_tensor[i])); + } + + return gxs; + } + +}; + +#endif // _CONCAT_PY_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/py/primitives/conv.i b/python/ideep4py/py/primitives/conv.i new file mode 100644 index 00000000..5d96d58c --- /dev/null +++ b/python/ideep4py/py/primitives/conv.i @@ -0,0 +1,48 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
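A minimal usage sketch for the concat wrapper declared above. The `concat`, `mdarrayVector`, and `intVector` names come from the `%template` lines in `concat.i` and `mdarray.i`; the exact offset convention that `Backward` expects is an assumption:

    import numpy
    import ideep4py

    xs = [ideep4py.mdarray(numpy.random.rand(2, c, 4, 4).astype(numpy.float32))
          for c in (8, 24)]
    y = ideep4py.concat.Forward(ideep4py.mdarrayVector(xs), 1)  # join along channels

    # Backward splits a gradient back at the recorded channel offsets
    # (offset convention assumed, not documented in this diff).
    gy = ideep4py.mdarray(numpy.random.rand(2, 32, 4, 4).astype(numpy.float32))
    gxs = ideep4py.concat.Backward(gy, ideep4py.intVector([0, 8]), 1)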
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "conv_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "conv_py.h" + +%template(convolution2D) Convolution2D_Py; + +// +// Python API for Convolution2D +// +// mdarray Convolution2D_Py::Forward( +// mdarray *src, mdarray *weights, +// mdarray *dst, mdarray *bias, +// conv_param_t *cp); +// std::vector Convolution2D_Py::BackwardWeights( +// mdarray *src, mdarray *diff_dst, +// con_prarm_t *cp); +// mdarray Convolution2D_Py::BackwardData( +// mdarray *weights, mdarray *diff_dst, +// conv_param_t *cp); diff --git a/python/ideep4py/py/primitives/conv_py.h b/python/ideep4py/py/primitives/conv_py.h new file mode 100644 index 00000000..d11ba34a --- /dev/null +++ b/python/ideep4py/py/primitives/conv_py.h @@ -0,0 +1,126 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _CONV_PY_H_ +#define _CONV_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "conv.h" + +template +class Convolution2D_Py +{ +public: + /* + * Python Convolution Forward + * Y = W*X + b + * params: + * src: input, x + * weight: weights, w + * dst: output, y + * bias: bias, b + * cp: convolution parameters + */ + static mdarray Forward(mdarray *src, + mdarray *weights, + mdarray *bias, + conv_param_t *cp) { + auto tensor = Convolution2D::Forward( + src->get()->tensor(), + weights->get()->tensor(), + bias ? 
bias->get()->tensor() : nullptr, cp); + + auto out = mdarray(tensor); + return out; + } + + /* + * Python Convolution backward weights + * gW = gy*x + * params: + * src: input, x + * diff_dst: diff dst, gy + * cp: convolution parameters + */ + static mdarray BackwardWeights(mdarray *src, + mdarray *diff_dst, + conv_param_t *cp) { + auto tensor = Convolution2D::BackwardWeights( + (src->get()->tensor()), + (diff_dst->get()->tensor()), cp); + + auto out = mdarray(tensor); + return out; + } + + /* + * Python Convolution backward weights & bias + * gW = gy*x + * params: + * src: input, x + * diff_dst: diff dst, gy + * cp: convolution parameters + */ + static std::vector BackwardWeightsBias(mdarray *src, + mdarray *diff_dst, + conv_param_t *cp) { + std::vector outs; + auto tensors = Convolution2D::BackwardWeightsBias( + (src->get()->tensor()), + (diff_dst->get()->tensor()), cp); + + for (int i = 0; i < tensors.size(); i++) + outs.push_back(mdarray(tensors[i])); + + return outs; + } + + /* + * Python Convolution backward data + * gx = gy*w + * param: + * weights: weights, w + * diff_dst: diff dst, gy + * cp: convolution parameters + */ + static mdarray BackwardData(mdarray *weights, + mdarray *diff_dst, + conv_param_t *cp) { + auto tensor = Convolution2D::BackwardData( + (weights->get()->tensor()), + (diff_dst->get()->tensor()), cp); + + auto out = mdarray(tensor); + return out; + } + +}; + +#endif // _CONV_PY_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/py/primitives/dropout.i b/python/ideep4py/py/primitives/dropout.i new file mode 100644 index 00000000..901e9216 --- /dev/null +++ b/python/ideep4py/py/primitives/dropout.i @@ -0,0 +1,35 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "dropout_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "std_vector.i" +%include "dropout_py.h" + +%template(dropout) Dropout_py; diff --git a/python/ideep4py/py/primitives/dropout_py.h b/python/ideep4py/py/primitives/dropout_py.h new file mode 100644 index 00000000..a3e80450 --- /dev/null +++ b/python/ideep4py/py/primitives/dropout_py.h @@ -0,0 +1,53 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
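Stepping back to the convolution wrapper completed in `conv_py.h` above, a hypothetical Python call sequence might look as follows; the field names follow the `conv_param_t` struct exposed as `convolution2DParam` in `param.i`, while the way `out_dims` is assigned from Python is an assumption:

    import numpy
    import ideep4py

    x = ideep4py.mdarray(numpy.random.rand(2, 16, 8, 8).astype(numpy.float32))
    w = ideep4py.mdarray(numpy.random.rand(32, 16, 3, 3).astype(numpy.float32))

    cp = ideep4py.convolution2DParam()
    cp.kh = cp.kw = 3                                  # kernel size
    cp.sy = cp.sx = 1                                  # stride
    cp.pad_lh = cp.pad_lw = cp.pad_rh = cp.pad_rw = 1  # "same" padding
    cp.out_dims = ideep4py.intVector([2, 32, 8, 8])    # assumed assignment style

    y = ideep4py.convolution2D.Forward(x, w, None, cp)  # bias pointer may be null
    gy = ideep4py.mdarray(numpy.ones((2, 32, 8, 8), numpy.float32))
    gx = ideep4py.convolution2D.BackwardData(w, gy, cp)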
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _DROPOUT_PY_H_ +#define _DROPOUT_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "dropout.h" + +template +class Dropout_py { +public: + static std::vector Forward(mdarray* x, float ratio) { + auto tensors = Dropout::Forward(x->get()->tensor(), ratio); + + std::vector outs; + for (const auto& tensor : tensors) { + outs.push_back(mdarray(tensor)); + } + + return outs; // [0]: mask, [1]: y + } + + static mdarray Backward(mdarray* mask, mdarray* gy) { + return mdarray(Dropout::Backward(mask->get()->tensor(), gy->get()->tensor())); + } +}; + +#endif // _DROPOUT_PY_H_ diff --git a/python/ideep4py/py/primitives/eltwise.i b/python/ideep4py/py/primitives/eltwise.i new file mode 100644 index 00000000..aea74cff --- /dev/null +++ b/python/ideep4py/py/primitives/eltwise.i @@ -0,0 +1,34 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "eltwise_py.h" +%} + +%include "std_vector.i" +%include "eltwise_py.h" + +%template(relu) Relu_Py; +%template(tanh) Tanh_Py; diff --git a/python/ideep4py/py/primitives/eltwise_py.h b/python/ideep4py/py/primitives/eltwise_py.h new file mode 100644 index 00000000..b583c512 --- /dev/null +++ b/python/ideep4py/py/primitives/eltwise_py.h @@ -0,0 +1,92 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
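The dropout header above documents its Forward outputs as [0]: mask, [1]: y, which the following sketch relies on; the module-level `dropout` name comes from the `%template` line in `dropout.i`:

    import numpy
    import ideep4py

    x = ideep4py.mdarray(numpy.random.rand(2, 16, 4, 4).astype(numpy.float32))
    mask, y = ideep4py.dropout.Forward(x, 0.5)  # [0]: mask, [1]: y per the header

    gy = ideep4py.mdarray(numpy.ones((2, 16, 4, 4), numpy.float32))
    gx = ideep4py.dropout.Backward(mask, gy)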
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#pragma once + +#include +#include +#include "mdarray.h" +#include "eltwise.h" + +template +class Relu_Py +{ +public: + static mdarray Forward(mdarray &src) { + // Shoule be removed in future???? + implementation::mdarray *src_internal = src.get(); + Tensor *dst_tensor = Eltwise::Forward( + src_internal->tensor(), ELTWISE_RELU, 0.0 , 0.0); + + mdarray dst_mdarray = mdarray(dst_tensor); + return dst_mdarray; + } + + static mdarray Backward(mdarray& src, mdarray& diff_dst) { + //FIXME + //Should be removed in future + Tensor *src_tensor = src.get()->tensor(); + Tensor *diff_dst_tensor = diff_dst.get()->tensor(); + + Tensor *diff_src_tensor = Eltwise::Backward(src_tensor, diff_dst_tensor, ELTWISE_RELU, 0.0, 0.0); + + // FIXME + // In future, mdarray will have a Tensor member, no need to create a new one + mdarray diff_src_mdarray = mdarray(diff_src_tensor); + return diff_src_mdarray; + } + +}; + +template +class Tanh_Py +{ +public: + static mdarray Forward(mdarray &src) { + // Shoule be removed in future???? + implementation::mdarray *src_internal = src.get(); + Tensor *dst_tensor = Eltwise::Forward( + src_internal->tensor(), ELTWISE_TANH, 0.0 , 0.0); + + mdarray dst_mdarray = mdarray(dst_tensor); + return dst_mdarray; + } + + static mdarray Backward(mdarray& src, mdarray& diff_dst) { + //FIXME + //Should be removed in future + Tensor *src_tensor = src.get()->tensor(); + Tensor *diff_dst_tensor = diff_dst.get()->tensor(); + + Tensor *diff_src_tensor = Eltwise::Backward(src_tensor, diff_dst_tensor, ELTWISE_TANH, 0.0, 0.0); + + // FIXME + // In future, mdarray will have a Tensor member, no need to create a new one + mdarray diff_src_mdarray = mdarray(diff_src_tensor); + return diff_src_mdarray; + } + +}; + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/py/primitives/linear.i b/python/ideep4py/py/primitives/linear.i new file mode 100644 index 00000000..51d44825 --- /dev/null +++ b/python/ideep4py/py/primitives/linear.i @@ -0,0 +1,48 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
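The eltwise wrappers above reduce to two static calls per activation; a sketch, assuming the `relu` and `tanh` names from the `%template` lines in `eltwise.i` are exposed at the top level of `ideep4py`:

    import numpy
    import ideep4py

    x = ideep4py.mdarray(
        numpy.random.uniform(-1, 1, (2, 16, 4, 4)).astype(numpy.float32))
    y = ideep4py.relu.Forward(x)

    gy = ideep4py.mdarray(numpy.ones((2, 16, 4, 4), numpy.float32))
    gx = ideep4py.relu.Backward(x, gy)  # tanh.Forward/Backward mirror this shape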
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "linear_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "std_vector.i" +%include "linear_py.h" + +%template(linear) Linear_Py; +// +// API for Linear +// mdarray Linear_F32::Forward( +// mdarray& src, mdarray& weights, +// mdarray& dst, mdarray& bias, +// linear_param_t& lp); +// std::vector Linear_F32::BackwardWeights( +// mdarray& src, mdarray& diff_dst, +// linear_param_t& lp); +// mdarray Linear_F32::BackwardData( +// mdarray& weights, mdarray& diff_dst, +// linear_param_t* lp); + diff --git a/python/ideep4py/py/primitives/linear_py.h b/python/ideep4py/py/primitives/linear_py.h new file mode 100644 index 00000000..8352cd46 --- /dev/null +++ b/python/ideep4py/py/primitives/linear_py.h @@ -0,0 +1,84 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _LINEAR_PY_H_ +#define _LINEAR_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "linear.h" + +template +class Linear_Py +{ +public: + static mdarray Forward(mdarray *src, + mdarray *weights, + mdarray *bias) { + auto tensor = Linear::Forward( + src->get()->tensor(), + weights->get()->tensor(), + bias ? 
bias->get()->tensor(): nullptr); + + auto out = mdarray(tensor); + return out; + } + + static mdarray BackwardWeights(mdarray* src, + mdarray* diff_dst) { + auto tensors = Linear::BackwardWeights( + src->get()->tensor(), + diff_dst->get()->tensor(), false); + + auto out = mdarray(tensors[0]); + return out; + } + + static std::vector BackwardWeightsBias(mdarray* src, + mdarray* diff_dst) { + std::vector outs; + auto tensors = Linear::BackwardWeights( + src->get()->tensor(), + diff_dst->get()->tensor(), true); + + for (int i = 0; i < tensors.size(); i++) + outs.push_back(mdarray(tensors[i])); + + return outs; + } + + static mdarray BackwardData(mdarray* weights, + mdarray* diff_dst) { + auto tensor = Linear::BackwardData( + weights->get()->tensor(), + diff_dst->get()->tensor()); + + auto out = mdarray(tensor); + return out; + } +}; + +#endif //_LINEAR_PY_H diff --git a/python/ideep4py/py/primitives/lrn.i b/python/ideep4py/py/primitives/lrn.i new file mode 100755 index 00000000..b08f220a --- /dev/null +++ b/python/ideep4py/py/primitives/lrn.i @@ -0,0 +1,47 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "lrn_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "std_vector.i" +%include "lrn_py.h" + +%template(localResponseNormalization) LocalResponseNormalization_Py; + +// +// Python API for LocalResponseNormalization +// +// std::vector LocalResponseNormalization_Py::Forward( +// mdarray *src, +// lrn_prarm_t *pp); +// mdarray* LocalResponseNormalization_Py::Backward( +// mdarray *src, +// mdarray *diff_dst, +// mdarray *ws, +// lrn_param_t *pp); diff --git a/python/ideep4py/py/primitives/lrn_py.h b/python/ideep4py/py/primitives/lrn_py.h new file mode 100755 index 00000000..bc7ce387 --- /dev/null +++ b/python/ideep4py/py/primitives/lrn_py.h @@ -0,0 +1,95 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
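A usage sketch for the linear (inner product) wrapper above, following the four entry points it defines; the `linear` name comes from the `%template` line in `linear.i`, and the weight layout is assumed:

    import numpy
    import ideep4py

    x = ideep4py.mdarray(numpy.random.rand(8, 32).astype(numpy.float32))
    w = ideep4py.mdarray(numpy.random.rand(16, 32).astype(numpy.float32))

    y = ideep4py.linear.Forward(x, w, None)       # bias pointer may be null
    gy = ideep4py.mdarray(numpy.ones((8, 16), numpy.float32))
    gw = ideep4py.linear.BackwardWeights(x, gy)   # weights only
    gw2, gb = ideep4py.linear.BackwardWeightsBias(x, gy)
    gx = ideep4py.linear.BackwardData(w, gy)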
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _LRN_PY_H_ +#define _LRN_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "lrn.h" + +template +class LocalResponseNormalization_Py +{ +public: + /* + * Python Lrn Forward + * params: + * src: input, x + * pp: lrn parameters + */ + static std::vector Forward(mdarray *src, + lrn_param_t *pp) { + std::vector outputs; + + // Shoule be removed in future???? + implementation::mdarray *src_internal = src->get(); + + std::vector outputs_tensor = LocalResponseNormalization::Forward( + (src_internal->tensor()), + pp); + //FIXME + for (int i = 0; i < outputs_tensor.size(); i++) { + outputs.push_back( mdarray(outputs_tensor[i]) ); + } + + return outputs; + } + + /* + * Python Lrn backward + * param: + * src: x + * diff_dst: diff dst, gy + * ws: workspace + * pp: lrn parameters + */ + static mdarray Backward(mdarray *src, mdarray *diff_dst, mdarray *ws, lrn_param_t *pp) { + //FIXME + //Should be removed in future + implementation::mdarray *diff_dst_internal = diff_dst->get(); + implementation::mdarray *src_internal = src->get(); + implementation::mdarray *ws_internal = ws->get(); + + Tensor *diff_src_tensor = LocalResponseNormalization::Backward( + (src_internal->tensor()), + (diff_dst_internal->tensor()), + (ws_internal->tensor()), + pp); + + + // FIXME + // In future, mdarray will have a Tensor member, no need to create a new one + mdarray diff_src_mdarray = mdarray(diff_src_tensor); + return diff_src_mdarray; + } + +}; + +#endif // _LRN_PY_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/py/primitives/param.i b/python/ideep4py/py/primitives/param.i new file mode 100644 index 00000000..45ad0911 --- /dev/null +++ b/python/ideep4py/py/primitives/param.i @@ -0,0 +1,60 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
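To make the LRN flow above concrete, a hedged sketch using the `lrn_param_t` fields defined in `param.i` below; whether Forward's output vector is ordered (y, workspace) is an assumption:

    import numpy
    import ideep4py

    pp = ideep4py.localResponseNormalizationParam()
    pp.n = 5          # local size
    pp.k = 2.0
    pp.alpha = 1e-4
    pp.beta = 0.75
    pp.algo_kind = ideep4py.localResponseNormalizationParam.lrn_across_channels

    x = ideep4py.mdarray(numpy.random.rand(2, 16, 4, 4).astype(numpy.float32))
    y, ws = ideep4py.localResponseNormalization.Forward(x, pp)  # assumed order
    gy = ideep4py.mdarray(numpy.ones((2, 16, 4, 4), numpy.float32))
    gx = ideep4py.localResponseNormalization.Backward(x, gy, ws, pp)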
+ * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%rename (convolution2DParam) conv_param_t; +struct conv_param_t { + std::vector out_dims; + int kh, kw; // kernel size + int dilate_y = 0, dilate_x = 0; // in MKL-DNN, common conv is treated as 0 dilate + int sy, sx; // stride + int pad_lh, pad_lw, pad_rh, pad_rw; //padding +}; + +%rename (pooling2DParam) pooling_param_t; +struct pooling_param_t { + std::vector out_dims; + int kh, kw; // kernel size + int sy, sx; // stride + int pad_lh, pad_lw, pad_rh, pad_rw; //padding + + enum algorithm { + pooling_max, + pooling_avg, + pooling_avg_include_padding, + pooling_avg_exclude_padding, + } algo_kind; +}; + +%rename (localResponseNormalizationParam) lrn_param_t; +struct lrn_param_t { + int n; // local size + double k; + double alpha; + double beta; + + enum algorithm { + lrn_across_channels, + lrn_within_channel, + } algo_kind; +}; diff --git a/python/ideep4py/py/primitives/pooling.i b/python/ideep4py/py/primitives/pooling.i new file mode 100644 index 00000000..16c9105d --- /dev/null +++ b/python/ideep4py/py/primitives/pooling.i @@ -0,0 +1,46 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + #define SWIG_FILE_WITH_INIT + #include "pooling_py.h" + #include "op_param.h" +%} + +%include "param.i" +%include "std_vector.i" +%include "pooling_py.h" + +%template(pooling2D) Pooling2D_Py; + +// +// Python API for Pooling2D +// +// std::vector Pooling2D_Py::Forward( +// mdarray *src, +// pooling_prarm_t *pp); +// mdarray* Pooling2D_Py::Backward( +// mdarray *diff_dst, +// mdarray *ws, +// conv_param_t *pp); diff --git a/python/ideep4py/py/primitives/pooling_py.h b/python/ideep4py/py/primitives/pooling_py.h new file mode 100644 index 00000000..fe1bbd68 --- /dev/null +++ b/python/ideep4py/py/primitives/pooling_py.h @@ -0,0 +1,104 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
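As a concrete illustration of the parameter structs above, here is a hypothetical construction of a max-pooling descriptor from Python; whether SWIG exposes the nested enum and the `out_dims` vector exactly as shown is an assumption:

    import ideep4py

    pp = ideep4py.pooling2DParam()
    pp.kh = pp.kw = 2                                  # kernel size
    pp.sy = pp.sx = 2                                  # stride
    pp.pad_lh = pp.pad_lw = pp.pad_rh = pp.pad_rw = 0  # no padding
    pp.algo_kind = ideep4py.pooling2DParam.pooling_max
    pp.out_dims = ideep4py.intVector([2, 16, 2, 2])    # assumed assignment style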
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +#ifndef _POOLING_PY_H_ +#define _POOLING_PY_H_ + +#include +#include +#include "op_param.h" +#include "mdarray.h" +#include "pooling.h" + +template +class Pooling2D_Py +{ +public: + /* + * Python Pooling Forward + * params: + * src: input, x + * pp: pooling parameters + */ + static std::vector Forward(mdarray *src, + pooling_param_t *pp) { + std::vector outputs; + + // Shoule be removed in future???? + implementation::mdarray *src_internal = src->get(); + + std::vector outputs_tensor = Pooling2D::Forward( + (src_internal->tensor()), + pp); + // FIXME + //FIXME + for (int i = 0; i < outputs_tensor.size(); i++) { + outputs.push_back( mdarray(outputs_tensor[i]) ); + } + + return outputs; + } + + /* + * Python Pooling backward + * param: + * diff_dst: diff dst, gy + * ws: workspace + * pp: pooling parameters + */ + static mdarray Backward(mdarray *diff_dst, + mdarray *ws, + pooling_param_t *pp) { + //FIXME + //Should be removed in future + implementation::mdarray *diff_dst_internal = diff_dst->get(); + implementation::mdarray *ws_internal; + if ( pp->algo_kind == pooling_param_t::algorithm::pooling_max) + ws_internal = ws->get(); + + Tensor *diff_src_tensor; + if ( pp->algo_kind == pooling_param_t::algorithm::pooling_max) { + diff_src_tensor = Pooling2D::Backward( + (diff_dst_internal->tensor()), + (ws_internal->tensor()), + pp); + } else { + diff_src_tensor = Pooling2D::Backward( + (diff_dst_internal->tensor()), + NULL, + pp); + } + + // FIXME + // In future, mdarray will have a Tensor member, no need to create a new one + mdarray diff_src_mdarray = mdarray(diff_src_tensor); + return diff_src_mdarray; + } + +}; + +#endif // _POOLING_PY_H_ + + +// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/python/ideep4py/py/swig_utils/asmap.i b/python/ideep4py/py/swig_utils/asmap.i new file mode 100644 index 00000000..e6ff7dab --- /dev/null +++ b/python/ideep4py/py/swig_utils/asmap.i @@ -0,0 +1,74 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
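Driving the pooling wrapper above from Python might then look like this sketch; the (y, workspace) output order is an assumption, and as the code above shows, the workspace is only consulted for max pooling:

    import numpy
    import ideep4py

    pp = ideep4py.pooling2DParam()
    pp.kh = pp.kw = pp.sy = pp.sx = 2
    pp.pad_lh = pp.pad_lw = pp.pad_rh = pp.pad_rw = 0
    pp.algo_kind = ideep4py.pooling2DParam.pooling_max

    x = ideep4py.mdarray(numpy.random.rand(2, 16, 4, 4).astype(numpy.float32))
    y, ws = ideep4py.pooling2D.Forward(x, pp)      # assumed output order
    gy = ideep4py.mdarray(numpy.ones((2, 16, 2, 2), numpy.float32))
    gx = ideep4py.pooling2D.Backward(gy, ws, pp)   # ws ignored for average pooling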
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + template + struct map_traits { + static Py_ssize_t mp_length(PyObject *self) { + void *that; + + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Wrong self object in mp_length"); + return 0; + } + + return (*reinterpret_cast(that))->mp_length(self); + } + + static PyObject *mp_subscript(PyObject *self, PyObject *op) { + void *that; + + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Wrong self object in mp_subscript"); + return nullptr; + } + + return (*reinterpret_cast(that))->mp_subscript(self, op); + } + + static int mp_ass_subscript(PyObject *self, PyObject *ind, PyObject *op) { + void *that; + + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Wrong self object in mp_subscript"); + return -1; + } + + return (*reinterpret_cast(that))->mp_ass_subscript(self, ind, op); + } + }; +%} + +%define %map_slot(name, type) + %feature("python:mp_" %str(name)) type "map_traits<" %str(type) ">::mp_" %str(name); +%enddef + +%define %map_protocol(type...) + %map_slot(length, type) + %map_slot(subscript, type) + %map_slot(ass_subscript, type) +%enddef diff --git a/python/ideep4py/py/swig_utils/asnumber.i b/python/ideep4py/py/swig_utils/asnumber.i new file mode 100644 index 00000000..325f075d --- /dev/null +++ b/python/ideep4py/py/swig_utils/asnumber.i @@ -0,0 +1,138 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
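These traits route Python's mapping slots to the C++ mdarray, so standard indexing should just work; a sketch, assuming `ideep4py.mdarray` wraps a numpy array as in the tests later in this patch:

    import numpy
    import ideep4py

    mx = ideep4py.mdarray(numpy.arange(12, dtype=numpy.float32).reshape(3, 4))
    n = len(mx)   # routed through map_traits<mdarray>::mp_length
    row = mx[0]   # mp_subscript
    mx[0] = row   # mp_ass_subscript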
IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + // TODO: Support both raw or smart pointer type + #define nb_unary(op, m) \ + static PyObject * nb_ ## op (PyObject *self) { \ + void *that; \ + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); \ + if (!SWIG_IsOK(res1)) { \ + PyErr_SetString(PyExc_ValueError, "Wrong self object in nb_unary wrapper"); \ + return nullptr; \ + } \ + return (*reinterpret_cast(that))->m_ ## m(self); \ + } + + #define nb_binary(op, m) \ + static PyObject * nb_ ## op (PyObject *left, PyObject *right) { \ + void *that; \ + int res1 = SWIG_ConvertPtr(left, &that, nullptr, 0); \ + if (SWIG_IsOK(res1)) { \ + return (*reinterpret_cast(that))->m_ ## m(left, right); \ + } else { \ + res1 = SWIG_ConvertPtr(right, &that, nullptr, 0); \ + if (!SWIG_IsOK(res1)) { \ + PyErr_SetString(PyExc_ValueError, "Wrong self object in nb_binary wrapper"); \ + return nullptr; \ + } \ + return (*reinterpret_cast(that))->m_ ## m(left, right); \ + } \ + } + + #define nb_ternary(op, m) \ + static PyObject * nb_ ## op (PyObject *self, PyObject *o1, PyObject *o2) { \ + void *that; \ + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); \ + if (!SWIG_IsOK(res1)) { \ + PyErr_SetString(PyExc_ValueError, "Wrong self object in nb_ternary wrapper"); \ + return nullptr; \ + } \ + return (*reinterpret_cast(that))->m_ ## m(self, o1, o2); \ + } + + + template + struct number_traits { + nb_binary(add, Add); + nb_binary(subtract, Subtract); + nb_binary(multiply, Multiply); + nb_binary(divide, Divide); + nb_binary(remainder, Remainder); + nb_binary(divmod, Divmod); + nb_ternary(power, Power); + nb_unary(negative, Negative); + nb_unary(positive, Positive); + nb_unary(absolute, Absolute); + nb_unary(invert, Invert); + nb_binary(lshift, Lshift); + nb_binary(rshift, Rshift); + nb_binary(and, And); + nb_binary(xor, Xor); + nb_binary(or, Or); + nb_binary(inplace_add, InPlaceAdd); + nb_binary(inplace_subtract, InPlaceSubtract); + nb_binary(inplace_multiply, InPlaceMultiply); + nb_binary(inplace_divide, InPlaceDivide); + nb_binary(inplace_remainder, InPlaceRemainder); + nb_ternary(inplace_power, InPlacePower); + nb_binary(inplace_lshift, InPlaceLshift); + nb_binary(inplace_rshift, InPlaceRshift); + nb_binary(inplace_and, InPlaceAnd); + nb_binary(inplace_xor, InPlaceXor); + nb_binary(inplace_or, InPlaceOr); + nb_binary(floor_divide, FloorDivide); + nb_binary(true_divide, TrueDivide); + nb_binary(inplace_floor_divide, InPlaceFloorDivide); + nb_binary(inplace_true_divide, InPlaceTrueDivide); + nb_binary(matrix_multiply, MatrixMultiply); + nb_binary(inplace_matrix_multiply, InPlaceMatrixMultiply); + }; +%} + +%define %nb_slot(name, type) + %feature("python:nb_" %str(name)) type "number_traits<" %str(type) ">::nb_" %str(name); +%enddef + +%define %number_protocol(type...) 
+ %nb_slot(add, type); + %nb_slot(subtract, type); + %nb_slot(multiply, type); + %nb_slot(divide, type) + %nb_slot(divmod, type); + %nb_slot(negative, type); + %nb_slot(positive, type); + %nb_slot(absolute, type); + %nb_slot(invert, type); + %nb_slot(lshift, type); + %nb_slot(rshift, type); + %nb_slot(and, type); + %nb_slot(xor, type); + %nb_slot(or, type); + %nb_slot(inplace_add, type); + %nb_slot(inplace_subtract, type); + %nb_slot(inplace_multiply, type); + %nb_slot(inplace_divide, type) + %nb_slot(inplace_remainder, type); + %nb_slot(inplace_power, type); + %nb_slot(inplace_lshift, type); + %nb_slot(inplace_rshift, type); + %nb_slot(inplace_and, type); + %nb_slot(inplace_xor, type); + %nb_slot(inplace_or, type); + %nb_slot(floor_divide, type); + %nb_slot(inplace_floor_divide, type); + %nb_slot(matrix_multiply, type); + %nb_slot(inplace_matrix_multiply, type); +%enddef diff --git a/python/ideep4py/py/swig_utils/getattro.i b/python/ideep4py/py/swig_utils/getattro.i new file mode 100644 index 00000000..2b945970 --- /dev/null +++ b/python/ideep4py/py/swig_utils/getattro.i @@ -0,0 +1,57 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + template + struct getattr_traits { + static PyObject *getattro_hook(PyObject *self, PyObject *name) { + + // Call python default first. + PyObject *res = PyObject_GenericGetAttr(self, name); + + // notify our hook if we find nothing from outside. + if (res == nullptr && PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + + void *that; + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); + + if (!SWIG_IsOK(res1)) { + PyErr_SetString(PyExc_ValueError, "Wrong self object in getattro wrapper"); + res = nullptr; + } + + // XXX: should we bump up reference counter? + // TODO: Support both raw and smart pointer + res = reinterpret_cast(that)->get()->getattro(self, name); + } + + return res; + } + }; +%} + +%define %getattr_wrapper(type...) + %feature("python:tp_getattro") type "getattr_traits<" %str(type) ">::getattro_hook"; +%enddef diff --git a/python/ideep4py/py/swig_utils/pep_3118.i b/python/ideep4py/py/swig_utils/pep_3118.i new file mode 100755 index 00000000..7a573469 --- /dev/null +++ b/python/ideep4py/py/swig_utils/pep_3118.i @@ -0,0 +1,74 @@ +/* + *Copyright (c) 2018 Intel Corporation. 
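With the number-protocol slots registered above, mdarray should participate in ordinary arithmetic expressions; a minimal sketch:

    import numpy
    import ideep4py

    mx = ideep4py.mdarray(numpy.ones((2, 2), numpy.float32))
    y = mx + mx   # nb_add dispatches to m_Add
    mx += 1.0     # in-place variants map to the InPlace* handlers
    z = mx / 2.0  # the divide slot doubles as true_divide on Python 3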
+ * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + template + struct buffer_traits { + #define GET_SELF_OBJ(self, that) \ + do { \ + int res1 = SWIG_ConvertPtr(self, &that, nullptr, 0); \ + if (!SWIG_IsOK(res1)) { \ + PyErr_SetString(PyExc_ValueError, "Wrong self object in getbuffer wrapper"); \ + return -1; \ + } \ + } while (0) + + static int getbuffer(PyObject *self, Py_buffer *view, int flags) { + void *that; + + GET_SELF_OBJ(self, that); + + // TODO: support smart pointer and raw at same time + return (*reinterpret_cast(that))->getbuffer(self, view, flags); + } + }; +%} + +%define %buffer_protocol_producer(type...) + %feature("python:bf_getbuffer") type "buffer_traits<" %str(type) ">::getbuffer"; + +#if defined(NEWBUFFER_ON) + %feature("python:tp_flags") type "Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER"; +#endif + +%enddef + +%define %buffer_protocol_typemap(VIEW) +%typemap(typecheck) (VIEW) { + $1 = PyObject_CheckBuffer($input); +} + +%typemap(in) (VIEW) (int res, Py_buffer view + , int flags = PyBUF_C_CONTIGUOUS | PyBUF_RECORDS) { + /* view = new Py_buffer;*/ + res = PyObject_GetBuffer($input, &view, flags); + if (res != 0) { + $1 = NULL; + goto fail; + } else { + $1 = ($1_ltype) &view; + } + // TODO: IF WE CONFRONT A F_CONTINGUOUS ONE??? +} +%enddef diff --git a/python/ideep4py/py/swig_utils/seq_typemap.i b/python/ideep4py/py/swig_utils/seq_typemap.i new file mode 100644 index 00000000..9728a4cd --- /dev/null +++ b/python/ideep4py/py/swig_utils/seq_typemap.i @@ -0,0 +1,69 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
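Because `bf_getbuffer` is wired to the PEP 3118 producer above, numpy can consume an mdarray directly; this mirrors the `numpy.asarray` calls in the tests later in this patch:

    import numpy
    import ideep4py

    mx = ideep4py.mdarray(numpy.ones((4, 4), numpy.float32))
    a = numpy.asarray(mx)  # numpy reads the mdarray through bf_getbuffer
    assert a.shape == (4, 4)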
diff --git a/python/ideep4py/py/swig_utils/seq_typemap.i b/python/ideep4py/py/swig_utils/seq_typemap.i new file mode 100644 index 00000000..9728a4cd --- /dev/null +++ b/python/ideep4py/py/swig_utils/seq_typemap.i @@ -0,0 +1,69 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%define %int_sequence_typemap(integer_sequence_compatible_type) + +%typemap(typecheck) (integer_sequence_compatible_type) { + $1 = PySequence_Check($input); +} + +%typemap(in) (integer_sequence_compatible_type) (int count) { + count = PySequence_Size($input); + + for (int i = 0; i < count; i++) { + PyObject *o = PySequence_GetItem($input, i); + $1.push_back(PyLong_AsLong(o)); + } +} +%enddef + +%define %at_sequence_typemap(at_sequence_compatible_type) + +%typemap(typecheck) (at_sequence_compatible_type) { + $1 = PySequence_Check($input); +} + +%typemap(in) (at_sequence_compatible_type) (int count, + at_sequence_compatible_type ins) { + count = PySequence_Size($input); + for (int i = 0; i < count; i++) { + PyObject *o = PySequence_GetItem($input, i); + mkldnn::primitive::at *tmp; + int res1 = SWIG_ConvertPtr(o, reinterpret_cast<void **>(&tmp) + , $descriptor(mkldnn::primitive::at *), 0); + + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1) + , "typemap 'mkldnn::primitive::at' sequence type failed"); + } + if (tmp == nullptr) { + SWIG_exception_fail(SWIG_ArgError(res1) + , "Input is not a sequence of 'mkldnn::primitive::at'"); + } + ins.emplace_back(*tmp); + } + + $1 = std::move(ins); +} +%enddef
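+ +// Illustrative effect of the integer-sequence typemap (the concrete +// parameter type is an assumption): once attached to, e.g., a +// mkldnn::memory::dims argument, Python callers may pass any integer +// sequence, such as op((1, 32, 224, 224)) or op([2, 2]).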
diff --git a/python/ideep4py/py/swig_utils/tp.i b/python/ideep4py/py/swig_utils/tp.i new file mode 100644 index 00000000..fd5bff74 --- /dev/null +++ b/python/ideep4py/py/swig_utils/tp.i @@ -0,0 +1,59 @@ +/* + *Copyright (c) 2018 Intel Corporation. + * + *Permission is hereby granted, free of charge, to any person obtaining a copy + *of this software and associated documentation files (the "Software"), to deal + *in the Software without restriction, including without limitation the rights + *to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + *copies of the Software, and to permit persons to whom the Software is + *furnished to do so, subject to the following conditions: + * + *The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + *IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + *FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + *AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + *LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + *OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + *THE SOFTWARE. + * + */ + + +%{ + template <typename T> + struct tp_traits { + static PyObject *tp_richcompare(PyObject *self, PyObject *other, int cmp_op) { + PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0 \ + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); + if (surrogate == nullptr) + return nullptr; + + PyObject *res = PyObject_RichCompare(surrogate, other, cmp_op); + Py_DECREF(surrogate); + return res; + } + + static PyObject *tp_iter(PyObject *self) { + PyObject *surrogate = PyArray_FromAny(self, nullptr, 0, 0 \ + , NPY_ARRAY_ELEMENTSTRIDES, nullptr); + if (surrogate == nullptr) + return nullptr; + + PyObject *res = PyObject_GetIter(surrogate); + Py_DECREF(surrogate); + return res; + } + }; +%} + +%define %tp_slot(name, type) + %feature("python:tp_" %str(name)) type "tp_traits<" %str(type) ">::tp_" %str(name); +%enddef + +%define %tp_protocol(type...) + %tp_slot(richcompare, type) + %tp_slot(iter, type) +%enddef diff --git a/python/ideep4py/tests/mm/test_acc_sum.py b/python/ideep4py/tests/mm/test_acc_sum.py new file mode 100644 index 00000000..784fc5f1 --- /dev/null +++ b/python/ideep4py/tests/mm/test_acc_sum.py @@ -0,0 +1,18 @@ +import numpy +import ideep4py + +x1 = numpy.random.uniform(-1, 1, (3, 16, 2, 4)).astype(numpy.float32) +x2 = numpy.random.uniform(-1, 1, (3, 16, 2, 4)).astype(numpy.float32) +x3 = numpy.random.uniform(-1, 1, (3, 16, 2, 4)).astype(numpy.float32) +x4 = numpy.random.uniform(-1, 1, (3, 16, 2, 4)).astype(numpy.float32) +mx1 = ideep4py.mdarray(x1) +mx2 = ideep4py.mdarray(x2) +mx3 = ideep4py.mdarray(x3) +mx4 = ideep4py.mdarray(x4) + +x = x1 + x2 + x3 + x4 +mx = ideep4py.basic_acc_sum((mx1, mx2, mx3, mx4)) +# mx = numpy.asarray(mx) +res = numpy.allclose(mx, x, 1e-5, 1e-4, True) +if not res: + print("basic_acc_sum mismatch!") diff --git a/python/ideep4py/tests/mm/test_copyto.py b/python/ideep4py/tests/mm/test_copyto.py new file mode 100755 index 00000000..fc080a83 --- /dev/null +++ b/python/ideep4py/tests/mm/test_copyto.py @@ -0,0 +1,22 @@ +import numpy +# from chainer import testing +# from chainer import utils +import ideep4py + +x1 = numpy.ndarray(shape=(2, 16, 2, 2), dtype=numpy.float32, order='C') +x2 = numpy.ndarray(shape=(2, 16, 2, 2), dtype=numpy.float32, order='C') +mx1 = ideep4py.mdarray(x1) +mx2 = ideep4py.mdarray(x2) +numpy.copyto(x2, x1) +ideep4py.basic_copyto(mx2, mx1) +t = numpy.asarray(mx2) +assert numpy.allclose(t, x2, 1e-5, 1e-4, True) + + +x1 = numpy.ndarray(shape=(2, 16, 2, 2), dtype=numpy.float32, order='C') +x2 = numpy.ndarray(shape=(2, 16, 2, 2), dtype=numpy.float32, order='C') +mx2 = ideep4py.mdarray(x2) +numpy.copyto(x2, x1) +ideep4py.basic_copyto(mx2, x1) +t = numpy.asarray(mx2) +assert numpy.allclose(t, x2, 1e-5, 1e-4, True) diff --git a/python/ideep4py/tests/mm/test_dlcp.py b/python/ideep4py/tests/mm/test_dlcp.py new file mode 100644 index 00000000..67b6966a --- /dev/null +++ b/python/ideep4py/tests/mm/test_dlcp.py @@ -0,0 +1,18 @@ +import ideep4py +from ideep4py import dlCompression + +import numpy + +a = numpy.arange(9, dtype=numpy.float32) +a = a.reshape((3, 3)) +am = ideep4py.array(a) + +ret = dlCompression.Compress(am, am, None, 4, dlCompression.dl_comp_dfp) +assert(ret == dlCompression.dl_comp_ok) + +ret = dlCompression.Decompress(am, am) +assert(ret == dlCompression.dl_comp_ok) + +_a = numpy.array(am) + +numpy.testing.assert_allclose(a, _a, atol=0.1, rtol=0.01, verbose=True)
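+ +# Editorial note: dl_comp_dfp is a lossy (dynamic fixed point) codec, +# which is presumably why this round-trip check uses loose tolerances +# (atol=0.1, rtol=0.01) rather than exact equality.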
diff --git a/python/ideep4py/tests/mm/test_mdarray.py b/python/ideep4py/tests/mm/test_mdarray.py new file mode 100755 index 00000000..60441c31 --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray.py @@ -0,0 +1,40 @@ +import numpy +from chainer import testing +from chainer import utils # NOQA +import ideep4py + +x1 = numpy.ndarray(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +x = x + 1 +testing.assert_allclose(x1 + 1, x) + +x = ideep4py.mdarray(x1) + +print(x) +print("ndim=", x.ndim) +print("shape=", x.shape) +print("size=", x.size) +print("dtype=", x.dtype) +print("is_mdarray=", x.is_mdarray) + +x1 += x +x += x +x2 = numpy.array(x) +testing.assert_allclose(x1, x2) + + +x1 = numpy.ones(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +y = x + x1 +y2 = numpy.array(y) +testing.assert_allclose(y2, x1 + x1) + +y = x * x1 +y2 = numpy.array(y) +testing.assert_allclose(y2, x1 * x1) + +x1 = numpy.random.uniform(-1, 1, (3, 4)).astype(numpy.float32) +x = ideep4py.mdarray(x1) +z1 = (x1 > 0).astype(x1.dtype) +z = (x > 0).astype(x1.dtype) +testing.assert_allclose(z, z1) diff --git a/python/ideep4py/tests/mm/test_mdarray3.py b/python/ideep4py/tests/mm/test_mdarray3.py new file mode 100755 index 00000000..3dc11460 --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray3.py @@ -0,0 +1,37 @@ +import numpy +from chainer import testing # NOQA +from chainer import utils # NOQA +import ideep4py + +x1 = numpy.ndarray(shape=(2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +print(x1) +y = x1 > 0 +print(y) +x *= y + + +# test divide +x1 = numpy.ndarray(shape=(2, 2), dtype=numpy.float32, order='C') +x1.fill(2.) +x = ideep4py.mdarray(x1) +testing.assert_allclose(1 / x1, 1 / x) +testing.assert_allclose(2 * x1, 2 * x) +testing.assert_allclose(1 - x1, 1 - x) +testing.assert_allclose(1 + x1, 1 + x) + +x1 /= 3 +x /= 3 +testing.assert_allclose(x1, x) + +x1 *= 2 +x *= 2 +testing.assert_allclose(x1, x) + +x1 += 3 +x += 3 +testing.assert_allclose(x1, x) + +x1 -= 5 +x -= 5 +testing.assert_allclose(x1, x) diff --git a/python/ideep4py/tests/mm/test_mdarray_iter.py b/python/ideep4py/tests/mm/test_mdarray_iter.py new file mode 100644 index 00000000..85029cee --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray_iter.py @@ -0,0 +1,59 @@ +import ideep4py # NOQA +import numpy +import six +from chainer import testing +from ideep4py import relu, mdarray + +# enumerate test +x = numpy.random.uniform(-1, 1, (256, 512, 13, 13)).astype(numpy.float32) +mx = mdarray(x) + +a = [] +b = [] +for p, xi in enumerate(x): + a.append(xi) +for p, mxi in enumerate(mx): + b.append(mxi) + +testing.assert_allclose(numpy.asarray(a), numpy.asarray(b)) + + +# zip test +x1 = numpy.random.uniform(-1, 1, (256, 512, 13, 13)).astype(numpy.float32) +x2 = numpy.random.uniform(-1, 1, (256, 512, 13, 13)).astype(numpy.float32) + +mx1 = mdarray(x1) +mx2 = mdarray(x2) + +a1 = [] +a2 = [] +b1 = [] +b2 = [] + +for x, y in six.moves.zip(x1, x2): + a1.append(x) + a2.append(y) + +for mx, my in six.moves.zip(mx1, mx2): + b1.append(mx) + b2.append(my) + +testing.assert_allclose(numpy.asarray(a1), numpy.asarray(b1)) +testing.assert_allclose(numpy.asarray(a2), numpy.asarray(b2)) + + +# mkl-dnn format test +x = numpy.random.uniform(-1, 1, (256, 512, 13, 13)).astype(numpy.float32) +y = numpy.maximum(x, 0, dtype=x.dtype) +mx = mdarray(x) +my = relu.Forward(mx) +testing.assert_allclose(y, my) + +a = [] +b = [] +for p, xi in enumerate(y): + a.append(xi) +for p, mxi in enumerate(my): + b.append(mxi) + +testing.assert_allclose(numpy.asarray(a), numpy.asarray(b))
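+ +# Editorial note: the loops above work because mdarray's tp_iter slot +# (see swig_utils/tp.i) materialises a temporary NumPy surrogate of the +# buffer; a rough pure-Python equivalent of that fallback is: +# +#   it = iter(numpy.asarray(mx))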
diff --git a/python/ideep4py/tests/mm/test_mdarray_reshape.py b/python/ideep4py/tests/mm/test_mdarray_reshape.py new file mode 100755 index 00000000..ab15f289 --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray_reshape.py @@ -0,0 +1,45 @@ +import numpy +import ideep4py + +# list case +x1 = numpy.ndarray(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +y1 = x1.reshape([4, 4]) +y = x.reshape([4, 4]) +res = numpy.allclose(y, y1, 1e-5, 1e-4) +if not res: + print("reshape (list) mismatch!") + +# single number case +x1 = numpy.ndarray(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +y1 = x1.reshape(16) +y = x.reshape(16) +res = numpy.allclose(y, y1, 1e-5, 1e-4) +if not res: + print("reshape (scalar) mismatch!") + +# value change +x1 = numpy.ndarray(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +print(type(x)) +y = x.reshape(len(x), -1) +x[0, 0, 0, 0] = 3.333 +assert(x[0, 0, 0, 0] == y[0, 0]) + +y = x.reshape((len(x), -1)) +x[0, 0, 0, 0] = 4.4444 +assert(x[0, 0, 0, 0] == y[0, 0]) + +# -1 case +x1 = numpy.ndarray(shape=(2, 2, 2, 2), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +y = x.reshape((2, 2, -1)) +y1 = x1.reshape((2, 2, -1)) +res = numpy.allclose(y, y1, 1e-5, 1e-4) +if not res: + print("reshape (-1) mismatch!") +y = x.reshape(2, 2, -1) +res = numpy.allclose(y, y1, 1e-5, 1e-4) +if not res: + print("reshape (-1) mismatch!") diff --git a/python/ideep4py/tests/mm/test_mdarray_sum.py b/python/ideep4py/tests/mm/test_mdarray_sum.py new file mode 100644 index 00000000..db9c5ebb --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray_sum.py @@ -0,0 +1,133 @@ +import ideep4py # NOQA +import numpy +from chainer import testing +from ideep4py import relu, mdarray + +print('mdarray sum [large shape routine]') +print('shape (256, 384, 13, 13) along (0, 2, 3)') +x = numpy.ndarray((256, 384, 13, 13), dtype=numpy.float32) +y = numpy.maximum(x, 0, dtype=x.dtype) + +mx = mdarray(x) +my = relu.Forward(mx) + +testing.assert_allclose(my.sum((0, 2, 3)), y.sum((0, 2, 3))) +print('pass ...\n') + + +print('mdarray sum [small shape routine]') +print('shape (39, 32, 13, 13) along (0, 2, 3)') +x = numpy.ndarray((39, 32, 13, 13), dtype=numpy.float32) +y = numpy.maximum(x, 0, dtype=x.dtype) + +mx = mdarray(x) +my = relu.Forward(mx) + +testing.assert_allclose(my.sum((0, 2, 3)), y.sum((0, 2, 3))) +print('pass ...\n') + + +print('mdarray sum [mkldnn format keepdims routine]') +print('shape (39, 32, 13, 13) along (0, 2, 3)') +x = numpy.ndarray((39, 32, 13, 13), dtype=numpy.float32) +y = numpy.maximum(x, 0, dtype=x.dtype) + +mx = mdarray(x) +my = relu.Forward(mx) + +testing.assert_allclose(my.sum((0, 2, 3), keepdims=True), + y.sum((0, 2, 3), keepdims=True)) +print('pass ...\n') + + +print('mdarray sum [common format small shape routine]') +print('shape (2, 2, 3, 3) along (0, 2, 3)') +x = numpy.ndarray((2, 2, 3, 3), dtype=numpy.float32) + +x.fill(2.3232) +x[0].fill(3.1212) +mx = mdarray(x) + +testing.assert_allclose(mx.sum((0, 2, 3)), x.sum((0, 2, 3))) +print('pass ...\n') + + +print('mdarray sum [common format small shape routine]') +print('shape (2, 2, 3, 3) along (1, 3)') +x = numpy.ndarray((2, 2, 3, 3), dtype=numpy.float32) + +x.fill(2.3232) +x[0].fill(3.1212) +mx = mdarray(x) + +testing.assert_allclose(mx.sum((1, 3)), x.sum((1, 3))) +print('pass ...\n') + + +print('mdarray sum [common format routine keepdims]') +print('shape (2, 2, 3, 3) along (0, 2, 3)') +x = numpy.ndarray((2, 2, 3, 3), dtype=numpy.float32) + +x.fill(2.3232) +x[0].fill(3.1212) +mx = mdarray(x) + +ms = mx.sum((0, 2, 3), keepdims=True) +ns = x.sum((0, 2, 3), keepdims=True) +testing.assert_allclose(ms, ns) +print('pass 
...\n') + + +print('mdarray sum [common format routine]') +print('shape (2, 15, 3, 3) along (0, 2, 3)') +x = numpy.ndarray((2, 15, 3, 3), dtype=numpy.float32) + +x.fill(1) +x[0].fill(3.1212) +mx = mdarray(x) + +ms = mx.sum((0, 2, 3)) +ns = x.sum((0, 2, 3)) +testing.assert_allclose(ms, ns) +print('pass ...\n') + + +print('mdarray sum [common format big shape routine]') +print('shape (256, 385, 13, 13) along (0, 2, 3)') +x = numpy.ndarray((256, 385, 13, 13), dtype=numpy.float32) + +x.fill(1) +x[0].fill(3.1212) +mx = mdarray(x) + +ms = mx.sum((0, 2, 3)) +ns = x.sum((0, 2, 3)) +testing.assert_allclose(ms, ns) +print('pass ...\n') + + +print('mdarray sum [common format big shape routine]') +print('shape (256, 1000) along (0)') +x = numpy.ndarray((256, 1000), dtype=numpy.float32) + +x.fill(1) +x[0].fill(3.1212) +mx = mdarray(x) + +ms = mx.sum((0)) +ns = x.sum((0)) +testing.assert_allclose(ms, ns) +print('pass ...\n') + +print('mdarray sum [common format big shape routine]') +print('shape (256, 1000) along (1)') +x = numpy.ndarray((256, 1000), dtype=numpy.float32) + +x.fill(1) +x[0].fill(3.1212) +mx = mdarray(x) + +ms = mx.sum((1)) +ns = x.sum((1)) +testing.assert_allclose(ms, ns) +print('pass ...\n') diff --git a/python/ideep4py/tests/mm/test_memcpy.py b/python/ideep4py/tests/mm/test_memcpy.py new file mode 100644 index 00000000..552aab00 --- /dev/null +++ b/python/ideep4py/tests/mm/test_memcpy.py @@ -0,0 +1,7 @@ +import numpy +import ideep4py +x1 = numpy.ndarray(shape=(1, 2, 3, 4), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x1) +x2 = numpy.array(x) +print("x = ", x1) +print("x2 = ", x2) diff --git a/python/ideep4py/tests/mm/test_tanh.py b/python/ideep4py/tests/mm/test_tanh.py new file mode 100644 index 00000000..5e724465 --- /dev/null +++ b/python/ideep4py/tests/mm/test_tanh.py @@ -0,0 +1,30 @@ +import numpy +from chainer import testing +import ideep4py + +# x = numpy.ndarray(shape=(1,32,224,224), dtype=numpy.float32, order='C') +x = numpy.random.uniform(-1, 1, (1, 32, 2, 224)).astype(numpy.float32) +y = numpy.tanh(x) + +mx = ideep4py.mdarray(x) +x2 = numpy.array(mx) +testing.assert_allclose(x, x2) + +print("tanh fwd") +my = ideep4py._ideep4py.tanh.Forward(mx) +y2 = numpy.array(my) +testing.assert_allclose(y, y2) + +# Test backward +print("tanh bwd") +x = numpy.random.uniform(-1, 1, (1, 32, 224, 224)).astype(numpy.float32) +gy = numpy.random.uniform(-1, 1, (1, 32, 224, 224)).astype(numpy.float32) +gx = gy * (1 - numpy.tanh(x) ** 2) + + +mx = ideep4py.mdarray(x) +mgy = ideep4py.mdarray(gy) +mgx = ideep4py._ideep4py.tanh.Backward(mx, mgy) + +gx1 = numpy.array(mgx) +testing.assert_allclose(gx1, gx) diff --git a/python/ideep4py/tests/primitives/test_bn.py b/python/ideep4py/tests/primitives/test_bn.py new file mode 100644 index 00000000..bfc4ea1b --- /dev/null +++ b/python/ideep4py/tests/primitives/test_bn.py @@ -0,0 +1,68 @@ +import numpy +import ideep4py + +from ideep4py import batchNormalization + + +def run(): + src = numpy.arange(3 * 2 * 2 * 2, dtype=numpy.float32) + src = src.reshape((3, 2, 2, 2)) + src = ideep4py.mdarray(src) + + gamma = numpy.ones(2, dtype=numpy.float32) + beta = numpy.zeros(2, dtype=numpy.float32) + w = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1)) + w = ideep4py.mdarray(w) + + eps = 2e-5 + + print("FWD *****************************") + y = batchNormalization.Forward(src, w, None, None, eps) + print(y) + print(-y[0]) + print(-y[1]) + print(-y[2]) + print("==============") + y = batchNormalization.Forward(src, w, None, None, eps) + print(y) + 
print(-y[0]) + print(-y[1]) + print(-y[2]) + print("==============") + mean = y[1] + var = y[2] + y = batchNormalization.Forward(src, w, mean, var, eps) + print(y) + print(-y[0]) + print("==============") + + print("BWD *****************************") + diff_dst = numpy.ones(src.shape, dtype=numpy.float32) + diff_dst = ideep4py.mdarray(diff_dst) + y = batchNormalization.Backward(src, diff_dst, mean, var, w, eps) + print(y) + print(-y[0]) + print(-y[1]) + print("==============") + y = batchNormalization.Backward(src, diff_dst, mean, var, w, eps) + print(y) + print(-y[0]) + print(-y[1]) + print("==============") + src = numpy.arange(3 * 2 * 3 * 3, dtype=numpy.float32) + src = src.reshape((3, 2, 3, 3)) + src = ideep4py.mdarray(src) + diff_dst = numpy.ones(src.shape, dtype=numpy.float32) + diff_dst = ideep4py.mdarray(diff_dst) + y = batchNormalization.Backward(src, diff_dst, mean, var, w, eps) + print(y) + print(-y[0]) + print(-y[1]) + print("==============") + y = batchNormalization.Backward(src, diff_dst, mean, var, None, eps) + print(y) + print(-y[0]) + print("==============") + + +run() diff --git a/python/ideep4py/tests/primitives/test_concat.py b/python/ideep4py/tests/primitives/test_concat.py new file mode 100755 index 00000000..ee61cbe4 --- /dev/null +++ b/python/ideep4py/tests/primitives/test_concat.py @@ -0,0 +1,45 @@ +import numpy +import ideep4py + +# from dnn._dnn import convolution2DParam, conv_test +from ideep4py import intVector, mdarrayVector, concat + +x1 = numpy.ndarray(shape=(1, 16, 224, 224), dtype=numpy.float32, order='C') +x2 = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x3 = numpy.ndarray(shape=(1, 64, 224, 224), dtype=numpy.float32, order='C') +inputs = (x1, x2, x3) +sizes = numpy.array( + [v.shape[1] for v in inputs[:-1]] +).cumsum() +print("sizes=", sizes) +print("type=", type(sizes)) + +x1 = ideep4py.mdarray(x1) +x2 = ideep4py.mdarray(x2) +x3 = ideep4py.mdarray(x3) + +xs = mdarrayVector() +xs.push_back(x1) +xs.push_back(x2) +xs.push_back(x3) + +print("fwd") +y = concat.Forward(xs, 1) +print("==============") +y = concat.Forward(xs, 1) +print("y.shape=", y.shape) + +print("backward") + +int_sizes = intVector() + +for i in sizes: + print("i=", i) + int_sizes.push_back(i) + +gxs = concat.Backward(y, int_sizes, 1) + +for gx in gxs: + print("gx.type=", type(gx)) + print("gx.shape=", gx.shape) +print("after backward") diff --git a/python/ideep4py/tests/primitives/test_conv.py b/python/ideep4py/tests/primitives/test_conv.py new file mode 100755 index 00000000..dfbc8431 --- /dev/null +++ b/python/ideep4py/tests/primitives/test_conv.py @@ -0,0 +1,106 @@ +import numpy +import ideep4py + +# from ideep4py import convolution2DParam, conv_test +from ideep4py import intVector, convolution2DParam, convolution2D + +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) + +w = numpy.ndarray(shape=(32, 32, 3, 3), dtype=numpy.float32, order='C') +w = ideep4py.mdarray(w) + +b = numpy.ndarray(shape=(32,), dtype=numpy.float32, order='C') +b = ideep4py.mdarray(b) + +cp = convolution2DParam() +cp.out_dims = intVector() +cp.out_dims.push_back(1) +cp.out_dims.push_back(32) +cp.out_dims.push_back(224) +cp.out_dims.push_back(224) +cp.sy = cp.sx = 1 +cp.pad_lh = cp.pad_lw = cp.pad_rh = cp.pad_rw = 1 + +print("fwd with bias") +y = convolution2D.Forward(x, w, b, cp) +print("==============") +y = convolution2D.Forward(x, w, b, cp) +print("==============") +y = convolution2D.Forward(y, w, b, cp) + +print("fwd without bias") +y 
= convolution2D.Forward(x, w, None, cp) +print("==============") +y = convolution2D.Forward(x, w, None, cp) +print("==============") +y = convolution2D.Forward(y, w, None, cp) + +print("bwd data") +x = convolution2D.BackwardData(w, y, cp) +print("==============") +x = convolution2D.BackwardData(w, y, cp) +print("==============") +x = convolution2D.BackwardData(w, y, cp) + +cp = convolution2DParam() +cp.out_dims = intVector() +cp.out_dims.push_back(32) +cp.out_dims.push_back(32) +cp.out_dims.push_back(3) +cp.out_dims.push_back(3) +cp.sy = cp.sx = 1 +cp.pad_lh = cp.pad_lw = cp.pad_rh = cp.pad_rw = 1 + +print("bwd weights with bias") +weights = convolution2D.BackwardWeightsBias(x, y, cp) +print("weights=", type(weights)) +print("len=", len(weights)) +print("gw.shape=", weights[0].shape) +print("gb.shape=", weights[1].shape) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeightsBias(x, y, cp) +print("weights=", type(weights)) +print("len=", len(weights)) +print("gw.shape=", weights[0].shape) +print("gb.shape=", weights[1].shape) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeightsBias(x, y, cp) +print("weights=", type(weights)) +print("len=", len(weights)) +print("gw.shape=", weights[0].shape) +print("gb.shape=", weights[1].shape) +print("==============") + +print("bwd weights without bias") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeights(x, y, cp) +print("weights=", type(weights)) +print("gw.shape=", weights.shape) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeights(x, y, cp) +print("weights=", type(weights)) +print("gw.shape=", weights.shape) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeights(x, y, cp) +print("weights=", type(weights)) +print("gw.shape=", weights.shape) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = convolution2D.BackwardWeights(x, y, cp) + +# print("type=", type(x)) +# print("shape=", y.shape) +# print("size=", y.size) +# print("ndim=", y.ndim) +# print("dtype=", y.dtype) diff --git a/python/ideep4py/tests/primitives/test_dropout.py b/python/ideep4py/tests/primitives/test_dropout.py new file mode 100644 index 00000000..cc2e36dc --- /dev/null +++ b/python/ideep4py/tests/primitives/test_dropout.py @@ -0,0 +1,21 @@ +import numpy +from chainer import ideepy + + +dropout_ratio = 0.8 + +# Forward +x = numpy.random.rand(128, 3, 224, 224).astype(numpy.float32) +x_md, = ideepy.to_mdarray((x, )) +mask, y = ideepy.dropout.Forward(x_md, dropout_ratio) +y = numpy.array(y, dtype=numpy.float32) +y_expect = x * mask +numpy.testing.assert_allclose(y, y_expect) + +# Backward +gy = numpy.random.rand(128, 3, 224, 224).astype(numpy.float32) +gy_md, = ideepy.to_mdarray((gy, )) +gx = ideepy.dropout.Backward(mask, gy_md) +gx = numpy.array(gx, dtype=numpy.float32) +gx_expect = gy * mask +numpy.testing.assert_allclose(gx, gx_expect) diff --git a/python/ideep4py/tests/primitives/test_linear.py b/python/ideep4py/tests/primitives/test_linear.py new file mode 100644 index 
00000000..17af7cf2 --- /dev/null +++ b/python/ideep4py/tests/primitives/test_linear.py @@ -0,0 +1,100 @@ +import numpy +import ideep4py +# from ideep4py import linearParam, linear_test +from ideep4py import linear + +x = numpy.ndarray(shape=(1, 32), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) + +w = numpy.ndarray(shape=(32, 32), dtype=numpy.float32, order='C') +print("ndarray w", w.shape) +w = ideep4py.mdarray(w) +print("w.dim", w.shape) +b = numpy.ndarray(shape=(32,), dtype=numpy.float32, order='C') +b = ideep4py.mdarray(b) + +print("===============2 dims============") + +print("fwd") +y = linear.Forward(x, w, b) +print("================") +y = linear.Forward(x, w, b) +print("================") +y = linear.Forward(x, w, b) + +print("bwd data") +x = linear.BackwardData(w, y) +print("================") +x = linear.BackwardData(w, y) +print("================") +x = linear.BackwardData(w, y) +print("================") + +print("bwd weight bias") +weights = linear.BackwardWeightsBias(x, y) +print("weights= ", type(weights)) +print("len", len(weights)) +print("gw.shape", weights[0].shape) +print("gb.shape = ", weights[1].shape) +print("================") + +x = numpy.ndarray(shape=(1, 32), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = linear.BackwardWeightsBias(x, y) +print("weights= ", type(weights)) +print("len", len(weights)) +print("gw.shape", weights[0].shape) +print("gb.shape = ", weights[1].shape) +print("================") + +x = numpy.ndarray(shape=(1, 32), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = linear.BackwardWeightsBias(x, y) +print("weights= ", type(weights)) +print("len", len(weights)) +print("gw.shape", weights[0].shape) +print("gb.shape = ", weights[1].shape) +print("================") + +print("bwd weight") +weights = linear.BackwardWeights(x, y) +print("weights= ", type(weights)) +print("gw.shape", weights.shape) +print("================") + +x = numpy.ndarray(shape=(1, 32), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = linear.BackwardWeights(x, y) +print("weights= ", type(weights)) +print("gw.shape", weights.shape) +print("================") + +x = numpy.ndarray(shape=(1, 32), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +weights = linear.BackwardWeights(x, y) +print("weights= ", type(weights)) +print("gw.shape", weights.shape) +print("================") + +# print("==========4 dims=================") +# +# x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +# x = ideep4py.mdarray(x) +# +# w = numpy.ndarray(shape=(32, 32, 224, 224), dtype=numpy.float32, order='C') +# print("ndarray w", w.shape) +# w = ideep4py.mdarray(w) +# print("w.dim", w.shape) +# b = numpy.ndarray(shape=(32,), dtype=numpy.float32, order='C') +# b = ideep4py.mdarray(b) +# +# print("fwd") +# y = linear.Forward(x, w, b) +# print("================") +# y = linear.Forward(x, w, b) +# print("================") +# y = linear.Forward(x, w, b) +# +# print("================") +# print("bwd data") +# x = linear.BackwardData(w, y) diff --git a/python/ideep4py/tests/primitives/test_lrn.py b/python/ideep4py/tests/primitives/test_lrn.py new file mode 100755 index 00000000..200d8661 --- /dev/null +++ b/python/ideep4py/tests/primitives/test_lrn.py @@ -0,0 +1,33 @@ +import numpy +import ideep4py + +from ideep4py import localResponseNormalizationParam +from ideep4py import localResponseNormalization + +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) + 
+pp = localResponseNormalizationParam() +pp.n = 5 +pp.k = 2 +pp.alpha = 1e-4 +pp.beta = .75 +pp.algo_kind = ideep4py.localResponseNormalizationParam.lrn_across_channels + +print("fwd") +(y, ws) = localResponseNormalization.Forward(x, pp) +print("==============") +(y, ws) = localResponseNormalization.Forward(x, pp) + +# print ("y =", y) +print("y.shape=", y.shape) +print("ws.shape=", ws.shape) +print("ws.dtype=", ws.dtype) + +print("==============") +print("bwd") +gx = localResponseNormalization.Backward(x, y, ws, pp) +print("==============") +gx = localResponseNormalization.Backward(x, y, ws, pp) +print("gx.shape=", gx.shape) +print("===== Finish backward=========") diff --git a/python/ideep4py/tests/primitives/test_pooling.py b/python/ideep4py/tests/primitives/test_pooling.py new file mode 100755 index 00000000..c05943da --- /dev/null +++ b/python/ideep4py/tests/primitives/test_pooling.py @@ -0,0 +1,56 @@ +import numpy +import ideep4py + +from ideep4py import pooling2DParam +from ideep4py import pooling2D + +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) + +pp = pooling2DParam() +pp.src_d1 = 1 +pp.src_d2 = 32 +pp.src_d3 = 224 +pp.src_d4 = 224 +pp.dst_d1 = 1 +pp.dst_d2 = 32 +pp.dst_d3 = 224 +pp.dst_d4 = 224 +pp.kh = pp.kw = 3 +pp.sy = pp.sx = 1 +pp.pad_lh = pp.pad_lw = pp.pad_rh = pp.pad_rw = 1 +pp.algo_kind = ideep4py.pooling2DParam.pooling_avg + +print("fwd") +y = pooling2D.Forward(x, pp) +print("==============") +y = pooling2D.Forward(x, pp) +print("==============") + +pp.algo_kind = ideep4py.pooling2DParam.pooling_max +(y, ws) = pooling2D.Forward(x, pp) +print("==============") +(y, ws) = pooling2D.Forward(x, pp) + +print("y.shape=", y.shape) +print("ws.shape=", ws.shape) +print("ws.dtype=", ws.dtype) + +print("==============") +print("bwd") +x = pooling2D.Backward(y, ws, pp) +print("==============") +x = pooling2D.Backward(y, ws, pp) +print("===== Finish max pooling backward=========") + +pp.algo_kind = ideep4py.pooling2DParam.pooling_avg +x = pooling2D.Backward(y, ws, pp) +print("==============") +x = pooling2D.Backward(y, ws, pp) +print("==============") +x = numpy.ndarray(shape=(1, 32, 224, 224), dtype=numpy.float32, order='C') +x = ideep4py.mdarray(x) +x = pooling2D.Backward(x, ws, pp) +print("===== Finish avg pooling backward =========") +print("x.shape=", x.shape) +print("==============") diff --git a/python/ideep4py/tests/primitives/test_relu.py b/python/ideep4py/tests/primitives/test_relu.py new file mode 100755 index 00000000..d97174c9 --- /dev/null +++ b/python/ideep4py/tests/primitives/test_relu.py @@ -0,0 +1,36 @@ +import numpy +from chainer import testing +import ideep4py +from ideep4py import relu + +# x = numpy.ndarray(shape=(1,32,224,224), dtype=numpy.float32, order='C') +x = numpy.random.uniform(-1, 1, (1, 32, 224, 224)).astype(numpy.float32) +y = numpy.maximum(x, 0, dtype=x.dtype) + +mx = ideep4py.mdarray(x) +x2 = numpy.array(mx) +testing.assert_allclose(x, x2) + +print("Relu fwd") +my = relu.Forward(mx) +y2 = numpy.array(my) +testing.assert_allclose(y, y2) +my = relu.Forward(my) +y2 = numpy.array(my) +testing.assert_allclose(y, y2) + + +# Test backward +print("Relu bwd") +x = numpy.random.uniform(-1, 1, (1, 32, 224, 224)).astype(numpy.float32) +gy = numpy.random.uniform(-1, 1, (1, 32, 224, 224)).astype(numpy.float32) +gx = (x > 0) * gy + + +mx = ideep4py.mdarray(x) +mgy = ideep4py.mdarray(gy) +mgx = relu.Backward(mx, mgy) + + +gx1 = numpy.array(mgx) +testing.assert_allclose(gx1, gx)
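+ +# Editorial sketch: relu.Backward masks the incoming gradient with the +# sign of the forward input, so a zero input must yield a zero gradient +# regardless of gy (kept commented out to leave the test behaviour intact): +# assert not numpy.count_nonzero(numpy.array( +#     relu.Backward(ideep4py.mdarray(numpy.zeros_like(x)), mgy)))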
diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000..36695aca --- /dev/null +++ b/python/setup.py @@ -0,0 +1,145 @@ +from setuptools.extension import Extension +from numpy import get_include +from platform import system + +import os +import sys +import external + +from setuptools.command.build_py import build_py +from setuptools.command.install import install +from setuptools import setup + +subdir = 'mkldnn' + +# Specify the prefix under which you installed ipl_mkldnn +# prefix = '/usr/local' +mkldnn_root = external.mkldnn.root() +mkldnn_version = 'ae00102be506ed0fe2099c6557df2aa88ad57ec1' +dlcp_root = os.getcwd() + '/external/dlcp' + + +def prepare_mkldnn(): + external.mkldnn.prepare(mkldnn_version) + + +class _build_py(build_py): + def run(self): + prepare_mkldnn() + self.run_command('build_ext') + build_py.run(self) + + +class _install(install): + def run(self): + prepare_mkldnn() + self.run_command('build_ext') + install.run(self) + + +swig_opts = ['-c++', '-builtin', '-modern', '-modernargs', + '-Iideep4py/py/mm', + '-Iideep4py/py/primitives', + '-Iideep4py/py/swig_utils', + '-Iideep4py/py/dlcp', + '-Iideep4py/include/primitives/', + '-Iideep4py/include/mm/', + '-I' + dlcp_root + '/include'] + +if sys.version_info.major < 3: + swig_opts += ['-DNEWBUFFER_ON'] + +ccxx_opts = ['-std=c++11', '-Wno-unknown-pragmas'] +link_opts = ['-Wl,-z,now', '-Wl,-z,noexecstack', + '-Wl,-rpath,' + mkldnn_root + '/lib', '-L' + mkldnn_root + '/lib', + '-Wl,-rpath,' + dlcp_root + '/lib', '-L' + dlcp_root + '/lib'] + +includes = [get_include(), + 'ideep4py/include', + 'ideep4py/include/mkl', + 'ideep4py/common', + 'ideep4py/include/mm', + 'ideep4py/py/mm', + 'ideep4py/py/primitives', + 'ideep4py/py/dlcp', + 'ideep4py/include/primitives', + 'ideep4py/include/blas', + 'ideep4py/include/primitives/ops', + 'ideep4py/include/primitives/prim_mgr', + mkldnn_root + '/include', + dlcp_root + '/include'] + +libraries = ['mkldnn', 'mklml_intel', 'dlcomp'] + +if system() == 'Linux': + ccxx_opts += ['-fopenmp', '-DOPENMP_AFFINITY'] + libraries += ['boost_system', 'glog', 'm'] + src = ['ideep4py/py/ideep4py.i', + 'ideep4py/py/dlcp/dlcp_py.cc', + 'ideep4py/mm/mem.cc', + 'ideep4py/mm/tensor.cc', + 'ideep4py/py/mm/mdarray.cc', + 'ideep4py/common/cpu_info.cc', + 'ideep4py/common/utils.cc', + 'ideep4py/common/common.cc', + 'ideep4py/blas/sum.cc', + 'ideep4py/py/mm/basic.cc', + 'ideep4py/primitives/ops/eltwise_fwd.cc', + 'ideep4py/primitives/ops/eltwise_bwd.cc', + 'ideep4py/primitives/eltwise.cc', + 'ideep4py/primitives/ops/conv_fwd.cc', + 'ideep4py/primitives/ops/conv_bwd_weights.cc', + 'ideep4py/primitives/ops/conv_bwd_data.cc', + 'ideep4py/primitives/ops/reorder_op.cc', + 'ideep4py/primitives/conv.cc', + 'ideep4py/primitives/ops/pooling_fwd.cc', + 'ideep4py/primitives/ops/pooling_bwd.cc', + 'ideep4py/primitives/pooling.cc', + 'ideep4py/primitives/ops/linear_fwd.cc', + 'ideep4py/primitives/ops/linear_bwd_weights.cc', + 'ideep4py/primitives/ops/linear_bwd_data.cc', + 'ideep4py/primitives/linear.cc', + 'ideep4py/primitives/bn.cc', + 'ideep4py/primitives/ops/bn_fwd.cc', + 'ideep4py/primitives/ops/bn_bwd.cc', + 'ideep4py/primitives/ops/concat_fwd.cc', + 'ideep4py/primitives/ops/concat_bwd.cc', + 'ideep4py/primitives/concat.cc', + 'ideep4py/primitives/ops/lrn_fwd.cc', + 'ideep4py/primitives/ops/lrn_bwd.cc', + 'ideep4py/primitives/lrn.cc', + 'ideep4py/primitives/dropout.cc', + ] +else: + # TODO + src = ['mkldnn/mdarray.i', 'mkldnn/mdarray.cc'] + +ext_modules = [] + +ext = Extension( + 'ideep4py._ideep4py', sources=src, +
swig_opts=swig_opts, + extra_compile_args=ccxx_opts, extra_link_args=link_opts, + include_dirs=includes, libraries=libraries) + +ext_modules.append(ext) + +packages = ['ideep4py', 'ideep4py.cosim'] + +setup( + name='ideep4py', + version='0.0', + description='', + author='Intel', + author_email='', + url='', + license='MIT License', + packages=packages, + ext_modules=ext_modules, + cmdclass={'install': _install, 'build_py': _build_py}, + zip_safe=False, + # setup_requires=setup_requires, + # install_requires=install_requires, + # tests_require=['mock', + # 'pytest'], +) diff --git a/tests/ideep4py_tests/test_batch_normalization.py b/tests/ideep4py_tests/test_batch_normalization.py new file mode 100644 index 00000000..1fe764e6 --- /dev/null +++ b/tests/ideep4py_tests/test_batch_normalization.py @@ -0,0 +1,136 @@ +import sys +import unittest + +import numpy +import six +import ideep4py +from ideep4py import batchNormalization + +try: + import testing + from testing import condition +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +def _x_hat(x, mean, inv_std): + x_mu = x - mean + x_mu *= inv_std + return x_mu + + +def _batch_normalization(expander, gamma, beta, x, mean, var): + mean = mean[expander] + std = numpy.sqrt(var)[expander] + y_expect = (gamma[expander] * (x - mean) / std + beta[expander]) + return y_expect + + +@testing.parameterize(*(testing.product({ + 'param_shape': [(3, ), ], + 'ndim': [2, ], + 'dtype': [numpy.float32], +}))) +class TestBatchNormalizationF32(unittest.TestCase): + + def setUp(self): + self.eps = 2e-5 + self.expander = (None, Ellipsis) + (None,) * self.ndim + self.gamma = numpy.random.uniform(.5, 1, + self.param_shape).astype(self.dtype) + self.beta = numpy.random.uniform(-1, 1, + self.param_shape).astype(self.dtype) + self.head_ndim = self.gamma.ndim + 1 + shape = (5,) + self.param_shape + (2,) * self.ndim + self.x = numpy.random.uniform(-1, 1, shape).astype(self.dtype) + self.gy = numpy.random.uniform(-1, 1, shape).astype(self.dtype) + + self.args = [self.x, self.gamma, self.beta] + self.aggr_axes = (0,) + tuple( + six.moves.range(self.head_ndim, self.x.ndim)) + self.mean = self.x.mean(axis=self.aggr_axes) + self.var = self.x.var(axis=self.aggr_axes) + self.eps + self.check_forward_options = {'atol': 1e-4, 'rtol': 1e-3} + self.check_backward_options = {'atol': 1e-4, 'rtol': 1e-3} + + def check_forward(self, args): + x, gamma, beta = args + expander = (None, Ellipsis) + (None,) * (x.ndim - self.head_ndim) + self.expander = expander + self.axis = (0,) + tuple(range(self.head_ndim, x.ndim)) + expand_dim = False + if x.ndim == 2: + expand_dim = True + x = x[:, :, None, None] + + gamma = gamma[expander] + beta = beta[expander] + W = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1)) + + y_act, self.mean, self.var, inv_std = batchNormalization.Forward( + ideep4py.mdarray(x), + ideep4py.mdarray(W), + None, + None, + self.eps + ) + + if expand_dim: + y_act = numpy.squeeze(y_act, axis=(2, 3)) + y_act = numpy.array(y_act, dtype=self.dtype) + + y_expect = _batch_normalization( + self.expander, self.gamma, self.beta, self.x, self.mean, self.var) + + numpy.testing.assert_allclose( + y_expect, y_act, **self.check_forward_options) + + @condition.retry(3) + def test_forward_cpu(self): + self.check_forward(self.args) + + def check_backward(self, args, y_grad): + x, gamma, beta = args + gy = y_grad + expander = self.expander + inv_m = gamma.dtype.type(1. 
/ (x.size // gamma.size)) + + expand_dim = False + if x.ndim == 2: + expand_dim = True + x = x[:, :, None, None] + gy = gy[:, :, None, None] + + gamma = gamma[self.expander] + beta = numpy.zeros_like(gamma) + W = numpy.concatenate((gamma, beta), axis=0).reshape((2, -1)) + + gx_act, gW = batchNormalization.Backward( + ideep4py.mdarray(x), + ideep4py.mdarray(gy), + ideep4py.mdarray(self.mean), + ideep4py.mdarray(self.var), + ideep4py.mdarray(W), + self.eps + ) + if expand_dim: + gx_act = numpy.squeeze(gx_act, axis=(2, 3)) + gx_act = numpy.array(gx_act, dtype=self.dtype) + + self.inv_std = self.var ** (-0.5) + + gbeta = y_grad.sum(axis=self.aggr_axes) + x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander]) + ggamma = (y_grad * x_hat).sum(axis=self.aggr_axes) + gx_expect = (self.gamma * self.inv_std)[expander] * ( + y_grad - (x_hat * ggamma[expander] + gbeta[expander]) * inv_m) + + numpy.testing.assert_allclose( + gx_expect, gx_act, **self.check_backward_options) + + def test_backward_cpu(self): + self.check_backward(self.args, self.gy) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_concat_py.py b/tests/ideep4py_tests/test_concat_py.py new file mode 100644 index 00000000..5eeeaf10 --- /dev/null +++ b/tests/ideep4py_tests/test_concat_py.py @@ -0,0 +1,102 @@ +import sys +import unittest + +import numpy +import ideep4py +from ideep4py import intVector, mdarrayVector, concat + +try: + import testing +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +@testing.parameterize(*testing.product_dict( + [ + {'shape': (7, 2, 3, 5), 'axis': 0, 'section': [2, 5], + 'slices': [[slice(None, 2)], [slice(2, 5)], + [slice(5, None)]]}, + {'shape': (2, 7, 3, 5), 'axis': 1, 'section': [2, 5], + 'slices': [[slice(None), slice(None, 2)], [slice(None), slice(2, 5)], + [slice(None), slice(5, None)]]}, + {'shape': (2, 3, 7, 5), 'axis': 2, 'section': [2, 5], + 'slices': [[slice(None), slice(None), slice(None, 2)], + [slice(None), slice(None), slice(2, 5)], + [slice(None), slice(None), slice(5, None)]]}, + {'shape': (2, 3, 5, 7), 'axis': 3, 'section': [2, 5], + 'slices': [[slice(None), slice(None), slice(None), slice(None, 2)], + [slice(None), slice(None), slice(None), slice(2, 5)], + [slice(None), slice(None), slice(None), slice(5, None)]]}, + {'shape': (60, 33, 3, 3), 'axis': 0, 'section': [12, 48], + 'slices': [[slice(None, 12)], + [slice(12, 48)], + [slice(48, None)]]}, + {'shape': (33, 60, 3, 3), 'axis': 1, 'section': [12, 48], + 'slices': [[slice(None), slice(None, 12)], + [slice(None), slice(12, 48)], + [slice(None), slice(48, None)]]}, + {'shape': (33, 3, 60, 3), 'axis': 2, 'section': [12, 48], + 'slices': [[slice(None), slice(None), slice(None, 12)], + [slice(None), slice(None), slice(12, 48)], + [slice(None), slice(None), slice(48, None)]]}, + {'shape': (33, 3, 3, 60), 'axis': 3, 'section': [12, 48], + 'slices': [[slice(None), slice(None), slice(None), slice(None, 12)], + [slice(None), slice(None), slice(None), slice(12, 48)], + [slice(None), slice(None), slice(None), slice(48, None)]]}, + ], + [ + {'dtype': numpy.float32}, + ], +)) +class TestConcatPyF32(unittest.TestCase): + + def setUp(self): + self.y = numpy.arange( + numpy.prod(self.shape), dtype=self.dtype).reshape(self.shape) + self.xs = [self.y[s] for s in self.slices] + + def check_forward(self, xs_data, y_data, axis): + xs = tuple(x_data for x_data in xs_data) + xs_mdarray = mdarrayVector() + for yi in xs: + if isinstance(yi, numpy.ndarray): + if 
yi.flags.contiguous is False: + yi = numpy.ascontiguousarray(yi) + yi = ideep4py.mdarray(numpy.ascontiguousarray(yi)) + xs_mdarray.push_back(yi) + y_act = concat.Forward(xs_mdarray, self.axis) + y_act = numpy.array(y_act, dtype=self.dtype) + + numpy.testing.assert_allclose(y_data, y_act, atol=0, rtol=0) + + def test_forward_cpu(self): + self.check_forward(self.xs, self.y, axis=self.axis) + + def check_backward(self, xs_data, y_data, axis): + xs = tuple(x_data for x_data in xs_data) + xs_mdarray = mdarrayVector() + for yi in xs: + if isinstance(yi, numpy.ndarray): + if yi.flags.contiguous is False: + yi = numpy.ascontiguousarray(yi) + yi = ideep4py.mdarray(numpy.ascontiguousarray(yi)) + xs_mdarray.push_back(yi) + y_data = ideep4py.mdarray(y_data) + offsets = intVector() + # FIXME + for i in self.section: + offsets.push_back(i) + x_act_mdarray = concat.Backward(y_data, offsets, self.axis) + i = 0 + for x in xs: + x_act = numpy.array(x_act_mdarray[i], dtype=self.dtype) + numpy.testing.assert_allclose( + x, x_act, atol=0, rtol=0) + i = i + 1 + + def test_backward_cpu(self): + self.check_backward(self.xs, self.y, axis=self.axis) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_convolution2d_py.py b/tests/ideep4py_tests/test_convolution2d_py.py new file mode 100644 index 00000000..54b8b93c --- /dev/null +++ b/tests/ideep4py_tests/test_convolution2d_py.py @@ -0,0 +1,175 @@ +import sys +import unittest +import numpy +import ideep4py +from ideep4py import convolution2DParam +from ideep4py import convolution2D + +try: + import testing + from testing import condition + from testing.conv import im2col_cpu, col2im_cpu, get_conv_outsize +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +def _set_cover_all(self, x, W): + in_h, in_w = x.shape[2:] + kh, kw = W.shape[2:] + self.cover_all = ( + in_h != get_conv_outsize(self.outh, kh, self.sy, + self.ph, d=self.dy) or + in_w != get_conv_outsize(self.outw, kw, self.sx, + self.pw, d=self.dx)) + + +@testing.parameterize(*testing.product({ + 'dtype': [numpy.float32, ], + 'cover_all': [False, True], + 'channel': [1, 2, 4, 8, 10, ], + 'bs': [1, 2, 4, 8, 10, 16, 32, 64, ], + 'with_bias': [True, ], +})) +@testing.fix_random() +class TestConvolution2DPyF32(unittest.TestCase): + + def setUp(self): + self.x_shape = (self.bs, self.channel, 224, 224) + self.w_shape = (self.channel, self.channel, 3, 3) + self.b_shape = self.channel + + self.x = numpy.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + self.x = ideep4py.mdarray(self.x) + self.w = numpy.random.uniform(-1, 1, self.w_shape).astype(self.dtype) + self.w = ideep4py.mdarray(self.w) + self.b = numpy.random.uniform(-1, 1, self.b_shape).astype(self.dtype) + self.b = ideep4py.mdarray(self.b) + + self.cp = convolution2DParam(self.x_shape, + 1, 1, + 1, 1, + 1, 1, + 1, 1) + + stride = 1 + pad = 1 + dilate = 1 + self.sy, self.sx = stride, stride + self.ph, self.pw = pad, pad + self.n = self.x_shape[0] + self.outc = self.w_shape[0] + self.outh = self.x_shape[2] + self.outw = self.x_shape[3] + self.cover_all = self.cover_all + self.dy, self.dx = dilate, dilate + + self.gy = numpy.random.uniform( + -1, 1, + (self.n, self.outc, self.outh, self.outw)).astype(self.dtype) + self.gy = ideep4py.mdarray(self.gy) + + self.check_forward_options = {'atol': 1e-3, 'rtol': 1e-2} + self.check_backward_options = {'atol': 1e-3, 'rtol': 1e-2} + + def check_forward(self, x, w, b, cp): + if self.with_bias: + y_act = convolution2D.Forward(x, w, b, cp) + else: 
+ y_act = convolution2D.Forward(x, w, None, cp) + y_act = numpy.array(y_act, dtype=self.dtype) + + x = numpy.array(x, dtype=self.dtype) + w = numpy.array(w, dtype=self.dtype) + b = numpy.array(b, dtype=self.dtype) + kh, kw = w.shape[2:] + col = im2col_cpu( + x, kh, kw, self.sy, self.sx, self.ph, self.pw, + cover_all=self.cover_all, dy=self.dy, dx=self.dx) + y = numpy.tensordot( + col, w, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) + if b is not None: + y += b + y_expect = numpy.rollaxis(y, 3, 1) + numpy.testing.assert_allclose( + y_act, y_expect, **self.check_forward_options) + + def test_forward_cpu(self): + self.check_forward(self.x, self.w, self.b, self.cp) + + def check_backward_weights(self, x, w, b, cp, gy): + gW_act, gB_act = convolution2D.BackwardWeightsBias(x, gy, cp) + gW_act = numpy.array(gW_act, dtype=self.dtype) + + x = numpy.array(x, dtype=self.dtype) + w = numpy.array(w, dtype=self.dtype) + b = numpy.array(b, dtype=self.dtype) + gy = numpy.array(gy, dtype=self.dtype) + kh, kw = w.shape[2:] + col = im2col_cpu( + x, kh, kw, self.sy, self.sx, self.ph, self.pw, + cover_all=self.cover_all, dy=self.dy, dx=self.dx) + + gW_expect = numpy.tensordot( + gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.dtype, copy=False) + numpy.testing.assert_allclose( + gW_act, gW_expect, **self.check_backward_options) + + @condition.retry(3) + def test_backward_cpu_weights(self): + print("test_backward_cpu_weights") + cp = convolution2DParam(self.w_shape, + 1, 1, + 1, 1, + 1, 1, + 1, 1) + + self.check_backward_weights(self.x, self.w, self.b, cp, self.gy) + + def check_backward_data(self, x, w, b, cp): + out_c, in_c, kh, kw = w.shape + n, out_c, in_h, in_w = x.shape + self.pd = self.sy * (in_h - 1) + ( + kh + (kh - 1) * (self.dy - 1)) - self.outh - self.ph + self.pr = self.sx * (in_w - 1) + ( + kw + (kw - 1) * (self.dx - 1)) - self.outw - self.pw + + _set_cover_all(self, x, w) + # create conv parameter + # for IA specific + param = convolution2DParam(x.shape, + self.dy, self.dx, + self.sy, self.sx, + self.ph, self.pw, + self.pd, self.pr) + y_act = convolution2D.BackwardData(w, x, param) + if b is not None: + y_act += b.reshape(1, b.size, 1, 1) + y_act = numpy.array(y_act, dtype=self.dtype) + + x = numpy.array(x, dtype=self.dtype) + w = numpy.array(w, dtype=self.dtype) + + gcol = numpy.tensordot(w, x, (0, 1)).astype(x.dtype, copy=False) + # - k, m, n: shape of out_channel + # - b: number of inputs + # - h, w: height and width of kernels + # k, m, n, b, h, w -> b, k, m, n, h, w + gcol = numpy.rollaxis(gcol, 3) + y_expect = col2im_cpu( + gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw, + dy=self.dy, dx=self.dx) + # b, k, h, w + if b is not None: + y_expect += b.reshape(1, b.size, 1, 1) + + numpy.testing.assert_allclose( + y_act, y_expect, **self.check_backward_options) + + @condition.retry(3) + def test_backward_cpu_data(self): + print("test_backward_cpu_data") + self.check_backward_data(self.x, self.w, self.b, self.cp) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_dropout.py b/tests/ideep4py_tests/test_dropout.py new file mode 100644 index 00000000..75d2c7bc --- /dev/null +++ b/tests/ideep4py_tests/test_dropout.py @@ -0,0 +1,52 @@ +import sys +import unittest + +import numpy +import ideep4py +from ideep4py import dropout + +try: + import testing +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +def _dropout(x, creator): + return x * creator.mask + + +@testing.parameterize(*testing.product({ + 
'dropout_ratio': [0.0, 0.1, 0.3, 0.5, 0.8], + 'dtype': [numpy.float32, ], +})) +@testing.fix_random() +class TestDropoutF32(unittest.TestCase): + + def setUp(self): + self.x = numpy.random.rand(128, 3, 224, 224).astype(self.dtype) + self.x_md = ideep4py.mdarray(self.x) + self.gy = numpy.random.rand(128, 3, 224, 224).astype(self.dtype) + + def check_forward(self, x, x_md): + mask, y = dropout.Forward(x_md, self.dropout_ratio) + y = numpy.array(y, dtype=self.dtype) + y_expect = x * mask + numpy.testing.assert_allclose(y, y_expect) + + def check_backward(self, x_md, gy): + mask, y = dropout.Forward(x_md, self.dropout_ratio) + gy_md = ideep4py.mdarray(gy) + gx = dropout.Backward(mask, gy_md) + gx = numpy.array(gx, dtype=self.dtype) + gx_expect = gy * mask + numpy.testing.assert_allclose(gx, gx_expect) + + def test_forward_cpu(self): + self.check_forward(self.x, self.x_md) + + def test_backward_cpu(self): + self.check_backward(self.x_md, self.gy) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_linear_py.py b/tests/ideep4py_tests/test_linear_py.py new file mode 100644 index 00000000..182111dd --- /dev/null +++ b/tests/ideep4py_tests/test_linear_py.py @@ -0,0 +1,107 @@ +import sys +import unittest +import numpy +import ideep4py +from ideep4py import linear + +try: + import testing + from testing import condition +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +@testing.parameterize(*testing.product({ + 'x_dtype': [numpy.float32], + 'W_dtype': [numpy.float32], +})) +class TestLinearPyF32(unittest.TestCase): + + def setUp(self): + self.W = numpy.random.uniform( + -1, 1, (2, 3)).astype(self.W_dtype) + self.b = numpy.random.uniform( + -1, 1, 2).astype(self.x_dtype) + + self.x = numpy.random.uniform(-1, 1, (4, 3)).astype(self.x_dtype) + self.gy = numpy.random.uniform(-1, 1, (4, 2)).astype(self.x_dtype) + + self.check_forward_options = {'atol': 5e-4, 'rtol': 5e-3} + self.check_backward_options = {'atol': 5e-4, 'rtol': 5e-3} + + def check_forward(self, x, W, b, y_expect): + with_bias = b is not None + + x = ideep4py.mdarray(x) + W = ideep4py.mdarray(W) + if with_bias: + b = ideep4py.mdarray(b) + y_act = linear.Forward(x, W, b) + else: + y_act = linear.Forward(x, W, None) + + y_act = numpy.array(y_act, dtype=self.x_dtype) + numpy.testing.assert_allclose( + y_expect, y_act, **self.check_forward_options) + + @condition.retry(3) + def test_forward_cpu(self): + self.check_forward(self.x, self.W, self.b, + self.x.dot(self.W.T) + self.b) + + @condition.retry(3) + def test_forward_cpu_nobias(self): + self.check_forward(self.x, self.W, None, self.x.dot(self.W.T)) + + def check_backward_data(self, x, W, gy): + gx_expect = gy.dot(W).astype(gy.dtype, copy=False) + + W = ideep4py.mdarray(W) + gy = ideep4py.mdarray(gy) + gx_act = linear.BackwardData(W, gy) + gx_act = numpy.array(gx_act, dtype=self.W_dtype) + numpy.testing.assert_allclose( + gx_expect, gx_act, **self.check_backward_options) + + @condition.retry(3) + def test_backward_cpu_data(self): + self.check_backward_data(self.x, self.W, self.gy) + + def check_backward_weights(self, x, gy): + gW_expect = gy.T.dot(x).astype(self.W_dtype, copy=False) + + x = ideep4py.mdarray(x) + gy = ideep4py.mdarray(gy) + gW_act = linear.BackwardWeights(x, gy) + gW_act = numpy.array(gW_act, dtype=self.W_dtype) + + numpy.testing.assert_allclose( + gW_expect, gW_act, **self.check_backward_options) + + @condition.retry(3) + def
test_backward_cpu_weights(self): + self.check_backward_weights(self.x, self.gy) + + def check_backward_weights_bias(self, x, gy): + gW_expect = gy.T.dot(x).astype(self.W_dtype, copy=False) + gb_expect = gy.sum((0)) + + x = ideep4py.mdarray(x) + gy = ideep4py.mdarray(gy) + (gW_act, gb_act) = linear.BackwardWeightsBias(x, gy) + gW_act = numpy.array(gW_act, dtype=self.W_dtype) + gb_act = numpy.array(gb_act, dtype=self.W_dtype) + + numpy.testing.assert_allclose( + gW_expect, gW_act, **self.check_backward_options) + numpy.testing.assert_allclose( + gb_expect, gb_act, **self.check_backward_options) + + @condition.retry(3) + def test_backward_cpu_weights_bias(self): + self.check_backward_weights_bias(self.x, self.gy) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_local_response_normalization_py.py b/tests/ideep4py_tests/test_local_response_normalization_py.py new file mode 100644 index 00000000..cb77740a --- /dev/null +++ b/tests/ideep4py_tests/test_local_response_normalization_py.py @@ -0,0 +1,86 @@ +import sys +import unittest + +import numpy +import six +import ideep4py +from ideep4py import localResponseNormalizationParam +from ideep4py import localResponseNormalization + +try: + import testing +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +@testing.parameterize(*testing.product({ + 'dtype': [numpy.float32], + 'shape': [(2, 7, 1, 1), (2, 7, 3, 2), ], +})) +class TestLocalResponseNormalizationPyF32(unittest.TestCase): + + def setUp(self): + self.x = numpy.random.uniform( + -1, 1, self.shape).astype(self.dtype) + self.gy = numpy.random.uniform( + -1, 1, self.shape).astype(self.dtype) + self.pp = localResponseNormalizationParam( + 5, 2, 1e-4, .75, + ideep4py.localResponseNormalizationParam.lrn_across_channels + ) + self.check_forward_options = {'atol': 1e-4, 'rtol': 1e-3} + self.check_backward_options = {'atol': 1e-4, 'rtol': 1e-3} + + def check_forward(self, x, pp): + x_mdarray = ideep4py.mdarray(x) + (y_act, ws) = localResponseNormalization.Forward(x_mdarray, pp) + y_act = numpy.array(y_act, dtype=self.dtype) + + y_expect = numpy.zeros_like(self.x) + for n, c, h, w in numpy.ndindex(self.x.shape): + s = 0 + for i in six.moves.range(max(0, c - 2), min(7, c + 2)): + s += self.x[n, i, h, w] ** 2 + denom = (2 + 1e-4 * s) ** .75 + y_expect[n, c, h, w] = self.x[n, c, h, w] / denom + + numpy.testing.assert_allclose( + y_expect, y_act, **self.check_forward_options) + + def test_forward_cpu(self): + self.check_forward(self.x, self.pp) + + def check_backward(self, x, gy, pp): + x_mdarray = ideep4py.mdarray(x) + gy_mdarray = ideep4py.mdarray(gy) + (y_act, ws) = localResponseNormalization.Forward(x_mdarray, pp) + gx_act = localResponseNormalization.Backward( + x_mdarray, gy_mdarray, ws, pp) + gx_act = numpy.array(gx_act, dtype=self.dtype) + + half_n = self.pp.n // 2 + x2 = numpy.square(x) + sum_part = x2.copy() + for i in six.moves.range(1, half_n + 1): + sum_part[:, i:] += x2[:, :-i] + sum_part[:, :-i] += x2[:, i:] + self.unit_scale = pp.k + pp.alpha * sum_part + self.scale = self.unit_scale ** -pp.beta + self.y = x_mdarray * self.scale + + summand = self.y * gy / self.unit_scale + sum_p = summand.copy() + for i in six.moves.range(1, half_n + 1): + sum_p[:, i:] += summand[:, :-i] + sum_p[:, :-i] += summand[:, i:] + + gx_expect = gy * self.scale - 2 * pp.alpha * pp.beta * x * sum_p + numpy.testing.assert_allclose( + gx_expect, gx_act, **self.check_backward_options) + + def test_backward_cpu(self): + 
self.check_backward(self.x, self.gy, self.pp) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_pooling_2d_py.py b/tests/ideep4py_tests/test_pooling_2d_py.py new file mode 100644 index 00000000..1b17a086 --- /dev/null +++ b/tests/ideep4py_tests/test_pooling_2d_py.py @@ -0,0 +1,81 @@ +import sys +import unittest + +import numpy +import six + +import ideep4py +from ideep4py import pooling2DParam +from ideep4py import pooling2D + +try: + import testing + from testing import condition + from testing.conv import col2im_cpu +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +@testing.parameterize(*testing.product({ + 'dtype': [numpy.float32], + 'channel': [1, 2, 4, 8, 10, 16, 24, 32, 64], + 'bs': [0, 1, 2, 4, 6, 8, 10, 16, 24, 32, 64], + 'stride': [2, ], +})) +class TestPooling2DPyF32(unittest.TestCase): + + def setUp(self): + self.x = numpy.random.uniform( + -1, 1, (self.bs, self.channel, 4, 3)).astype(self.dtype) + self.gy = numpy.random.uniform( + -1, 1, (self.bs, self.channel, 2, 2)).astype(self.dtype) + + self.pp_fwd = pooling2DParam( + self.gy.shape, 3, 3, self.stride, self.stride, 1, 1, + 1, 1, pooling2DParam.pooling_avg_include_padding) + self.pp_bwd = pooling2DParam( + (self.bs, self.channel, 4, 3), 3, 3, self.stride, self.stride, + 1, 1, 1, 1, pooling2DParam.pooling_avg_include_padding) + + self.check_forward_options = {'atol': 1e-5, 'rtol': 1e-4} + self.check_backward_options = {'atol': 1e-5, 'rtol': 1e-4} + + def check_forward(self, x, pp): + x_mdarray = ideep4py.mdarray(x) + (y_act,) = pooling2D.Forward(x_mdarray, pp) + y_act = numpy.array(y_act, dtype=self.dtype) + + for k in six.moves.range(self.bs): + for c in six.moves.range(self.channel): + x = self.x[k, c] + expect = numpy.array([ + [x[0:2, 0:2].sum(), x[0:2, 1:3].sum()], + [x[1:4, 0:2].sum(), x[1:4, 1:3].sum()]]) / 9 + numpy.testing.assert_allclose( + expect, y_act[k, c], **self.check_forward_options) + + @condition.retry(3) + def test_forward_cpu(self): + self.check_forward(self.x, self.pp_fwd) + + def check_backward(self, x, gy, pp): + # self.shape[2:] + h, w = 4, 3 + gcol = numpy.tile(gy[:, :, None, None], + (1, 1, 3, 3, 1, 1)) + gx_expect = col2im_cpu(gcol, 2, 2, 1, 1, h, w) + gx_expect /= 3 * 3 + gy_mdarray = ideep4py.mdarray(gy) + gx_act = pooling2D.Backward(gy_mdarray, None, pp) + gx_act = numpy.array(gx_act, dtype=self.dtype) + + numpy.testing.assert_allclose( + gx_expect, gx_act, **self.check_backward_options) + + @condition.retry(3) + def test_backward_cpu(self): + self.check_backward(self.x, self.gy, self.pp_bwd) + + +testing.run_module(__name__, __file__) diff --git a/tests/ideep4py_tests/test_relu_py.py b/tests/ideep4py_tests/test_relu_py.py new file mode 100644 index 00000000..97cf1dd0 --- /dev/null +++ b/tests/ideep4py_tests/test_relu_py.py @@ -0,0 +1,65 @@ +import sys +import unittest + +import numpy + +import ideep4py +from ideep4py import relu + +try: + import testing +except Exception as ex: + print('*** testing directory is missing: %s' % ex) + sys.exit(-1) + + +@testing.parameterize(*testing.product({ + 'shape': [(3, 2), (224, 224)], + 'dtype': [numpy.float32, ], +})) +@testing.fix_random() +class TestReluPyF32(unittest.TestCase): + + def setUp(self): + self.x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) + self.y = numpy.maximum(self.x, 0, dtype=(self.x).dtype) + self.gy = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) + self.gx = (self.x > 0) * self.gy + + def check_forward(self, x, y): + mx = 
diff --git a/tests/ideep4py_tests/test_relu_py.py b/tests/ideep4py_tests/test_relu_py.py
new file mode 100644
index 00000000..97cf1dd0
--- /dev/null
+++ b/tests/ideep4py_tests/test_relu_py.py
@@ -0,0 +1,65 @@
+import sys
+import unittest
+
+import numpy
+
+import ideep4py
+from ideep4py import relu
+
+try:
+    import testing
+except Exception as ex:
+    print('*** testing directory is missing: %s' % ex)
+    sys.exit(-1)
+
+
+@testing.parameterize(*testing.product({
+    'shape': [(3, 2), (224, 224)],
+    'dtype': [numpy.float32, ],
+}))
+@testing.fix_random()
+class TestReluPyF32(unittest.TestCase):
+
+    def setUp(self):
+        self.x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.y = numpy.maximum(self.x, 0, dtype=self.x.dtype)
+        self.gy = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.gx = (self.x > 0) * self.gy
+
+    def check_forward(self, x, y):
+        mx = ideep4py.mdarray(x)
+        x2 = numpy.array(mx)
+        numpy.testing.assert_allclose(x, x2)
+        my = relu.Forward(mx)
+        y2 = numpy.array(my)
+        numpy.testing.assert_allclose(y, y2)
+
+    def test_forward_cpu(self):
+        self.check_forward(self.x, self.y)
+
+    def check_double_forward(self, x, y):
+        mx = ideep4py.mdarray(x)
+        x2 = numpy.array(mx)
+        numpy.testing.assert_allclose(x, x2)
+        my = relu.Forward(mx)
+        y2 = numpy.array(my)
+        numpy.testing.assert_allclose(y, y2)
+        my = relu.Forward(my)
+        y2 = numpy.array(my)
+        numpy.testing.assert_allclose(y, y2)
+
+    def test_double_forward_cpu(self):
+        self.check_double_forward(self.x, self.y)
+
+    def check_backward(self, x, gy, gx):
+        mx = ideep4py.mdarray(x)
+        mgy = ideep4py.mdarray(gy)
+        mgx = relu.Backward(mx, mgy)
+        gx1 = numpy.array(mgx)
+        numpy.testing.assert_allclose(gx1, gx)
+
+    def test_backward_cpu(self):
+        self.check_backward(self.x, self.gy, self.gx)
+
+
+testing.run_module(__name__, __file__)
diff --git a/tests/ideep4py_tests/testing/__init__.py b/tests/ideep4py_tests/testing/__init__.py
new file mode 100644
index 00000000..5c8e6e5f
--- /dev/null
+++ b/tests/ideep4py_tests/testing/__init__.py
@@ -0,0 +1,18 @@
+from testing import parameterized  # NOQA
+from testing.parameterized import parameterize  # NOQA
+from testing.parameterized import product  # NOQA
+from testing.parameterized import product_dict  # NOQA
+from testing.random import fix_random  # NOQA
+
+
+def run_module(name, file):
+    """Run the test cases in the given file.
+
+    Args:
+        name: __name__ attribute of the file.
+        file: __file__ attribute of the file.
+    """
+
+    if name == '__main__':
+        import pytest
+        pytest.main([file, '-vvs', '-x', '--pdb'])
diff --git a/tests/ideep4py_tests/testing/condition.py b/tests/ideep4py_tests/testing/condition.py
new file mode 100644
index 00000000..cf7462d7
--- /dev/null
+++ b/tests/ideep4py_tests/testing/condition.py
@@ -0,0 +1,112 @@
+import functools
+import unittest
+
+import six
+
+
+class QuietTestRunner(object):
+
+    def run(self, suite):
+        result = unittest.TestResult()
+        suite(result)
+        return result
+
+
+def repeat_with_success_at_least(times, min_success):
+    """Decorator for multiple trials of a test case.
+
+    The decorated test case is launched multiple times and is judged as
+    passed when it succeeds in at least ``min_success`` trials.
+    Once the number of successful trials reaches ``min_success``,
+    the remaining trials are skipped.
+
+    Args:
+        times(int): The number of trials.
+        min_success(int): The number of successful trials required for
+            the decorated test case to be regarded as passed.
+
+    """
+
+    assert times >= min_success
+
+    def _repeat_with_success_at_least(f):
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+            assert len(args) > 0
+            instance = args[0]
+            assert isinstance(instance, unittest.TestCase)
+            success_counter = 0
+            failure_counter = 0
+            results = []
+
+            def fail():
+                msg = '\nFail: {0}, Success: {1}'.format(
+                    failure_counter, success_counter)
+                if len(results) > 0:
+                    first = results[0]
+                    errs = first.failures + first.errors
+                    if len(errs) > 0:
+                        err_msg = '\n'.join(fail[1] for fail in errs)
+                        msg += '\n\nThe first error message:\n' + err_msg
+                instance.fail(msg)
+
+            for _ in six.moves.range(times):
+                suite = unittest.TestSuite()
+                # Create new instance to call the setup and the teardown only
+                # once.
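+                # A fresh TestCase instance is created for every trial, and
+                # unittest.FunctionTestCase runs the wrapped method between
+                # that instance's setUp and tearDown, so no state leaks
+                # across trials.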
+                ins = type(instance)(instance._testMethodName)
+                suite.addTest(
+                    unittest.FunctionTestCase(
+                        lambda: f(ins, *args[1:], **kwargs),
+                        setUp=ins.setUp,
+                        tearDown=ins.tearDown))
+
+                result = QuietTestRunner().run(suite)
+                if result.wasSuccessful():
+                    success_counter += 1
+                else:
+                    results.append(result)
+                    failure_counter += 1
+                if success_counter >= min_success:
+                    instance.assertTrue(True)
+                    return
+                if failure_counter > times - min_success:
+                    fail()
+                    return
+            fail()
+        return wrapper
+    return _repeat_with_success_at_least
+
+
+def repeat(times):
+    """Decorator that requires the test to succeed multiple times in a row.
+
+    The decorated test case is launched multiple times and is regarded
+    as passed only if it is successful the specified number of times
+    in a row.
+
+    .. note::
+        In the current implementation, this decorator records the
+        failure information of each trial.
+
+    Args:
+        times(int): The number of trials.
+    """
+    return repeat_with_success_at_least(times, times)
+
+
+def retry(times):
+    """Decorator that requires the test to succeed at least once.
+
+    The decorated test case is launched multiple times and is regarded
+    as passed if it is successful at least once.
+
+    .. note::
+        In the current implementation, this decorator records the
+        failure information of each trial.
+
+    Args:
+        times(int): The number of trials.
+    """
+    return repeat_with_success_at_least(times, 1)
diff --git a/tests/ideep4py_tests/testing/conv.py b/tests/ideep4py_tests/testing/conv.py
new file mode 100644
index 00000000..87169691
--- /dev/null
+++ b/tests/ideep4py_tests/testing/conv.py
@@ -0,0 +1,72 @@
+import numpy
+import six
+
+
+def get_conv_outsize(size, k, s, p, cover_all=False, d=1):
+    """Calculates the output size of a convolution.
+
+    This function takes the size of the input feature map, kernel, stride,
+    and padding of one particular dimension, then calculates the output
+    feature map size of that dimension.
+
+    .. seealso:: :func:`~chainer.utils.get_deconv_outsize`
+
+    Args:
+        size (int): The size of the input feature map. It is usually the
+            length of a side of the feature map.
+        k (int): The size of the convolution kernel.
+        s (int): The size of the stride.
+        p (int): The size of the padding.
+        cover_all (bool): Use ``cover_all`` option or not.
+        d (int): The size of the dilation.
+
+    Returns:
+        int: The expected output size of the convolution operation.
+
+    """
+    dk = k + (k - 1) * (d - 1)
+    if cover_all:
+        return (size + p * 2 - dk + s - 1) // s + 1
+    else:
+        return (size + p * 2 - dk) // s + 1
+
+
+def im2col_cpu(
+        img, kh, kw, sy, sx, ph, pw, pval=0, cover_all=False, dy=1, dx=1,
+        out_h=None, out_w=None):
+    n, c, h, w = img.shape
+    if out_h is None:
+        out_h = get_conv_outsize(h, kh, sy, ph, cover_all, dy)
+    assert out_h > 0, 'Height in the output should be positive.'
+    if out_w is None:
+        out_w = get_conv_outsize(w, kw, sx, pw, cover_all, dx)
+    assert out_w > 0, 'Width in the output should be positive.'
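+
+    # The bottom/right padding is widened by an extra (stride - 1) pixels so
+    # that the strided slices below never run past the padded array edge,
+    # whatever out_h and out_w turn out to be.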
+    img = numpy.pad(img,
+                    ((0, 0), (0, 0), (ph, ph + sy - 1), (pw, pw + sx - 1)),
+                    mode='constant', constant_values=(pval,))
+    col = numpy.ndarray((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
+
+    for j in six.moves.range(kh):
+        jdy = j * dy
+        j_lim = jdy + sy * out_h
+        for i in six.moves.range(kw):
+            idx = i * dx
+            i_lim = idx + sx * out_w
+            col[:, :, j, i, :, :] = img[:, :, jdy:j_lim:sy, idx:i_lim:sx]
+
+    return col
+
+
+def col2im_cpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1):
+    n, c, kh, kw, out_h, out_w = col.shape
+    img = numpy.zeros((n, c, h + 2 * ph + sy - 1, w + 2 * pw + sx - 1),
+                      dtype=col.dtype)
+    for j in six.moves.range(kh):
+        jdy = j * dy
+        j_lim = jdy + sy * out_h
+        for i in six.moves.range(kw):
+            idx = i * dx
+            i_lim = idx + sx * out_w
+            img[:, :, jdy:j_lim:sy, idx:i_lim:sx] += col[:, :, j, i]
+    return img[:, :, ph:h + ph, pw:w + pw]
diff --git a/tests/ideep4py_tests/testing/parameterized.py b/tests/ideep4py_tests/testing/parameterized.py
new file mode 100644
index 00000000..1865a0af
--- /dev/null
+++ b/tests/ideep4py_tests/testing/parameterized.py
@@ -0,0 +1,93 @@
+import functools
+import inspect
+import itertools
+import sys
+import types
+import unittest
+
+import six
+
+
+def _gen_case(base, module, i, param):
+    cls_name = '%s_param_%d' % (base.__name__, i)
+
+    # Add parameters as members
+
+    def __str__(self):
+        name = base.__str__(self)
+        return '%s parameter: %s' % (name, param)
+
+    mb = {'__str__': __str__}
+    for k, v in six.iteritems(param):
+        if isinstance(v, types.FunctionType):
+
+            def create_new_v():
+                f = v
+
+                def new_v(self, *args, **kwargs):
+                    return f(*args, **kwargs)
+                return new_v
+
+            mb[k] = create_new_v()
+        else:
+            mb[k] = v
+
+    cls = type(cls_name, (base,), mb)
+
+    # Wrap test methods to generate useful error message
+
+    def wrap_test_method(method):
+        @functools.wraps(method)
+        def wrap(*args, **kwargs):
+            try:
+                return method(*args, **kwargs)
+            except AssertionError as e:
+                s = six.StringIO()
+                s.write('Parameterized test failed.\n\n')
+                s.write('Base test method: {}.{}\n'.format(
+                    base.__name__, method.__name__))
+                s.write('Test parameters:\n')
+                for k, v in six.iteritems(param):
+                    s.write('  {}: {}\n'.format(k, v))
+                s.write('\n')
+                s.write('{}: {}\n'.format(type(e).__name__, e))
+                raise AssertionError(s.getvalue())
+        return wrap
+
+    # ismethod for Python 2 and isfunction for Python 3
+    members = inspect.getmembers(
+        cls, predicate=lambda _: inspect.ismethod(_) or inspect.isfunction(_))
+    for name, method in members:
+        if name.startswith('test_'):
+            setattr(cls, name, wrap_test_method(method))
+
+    # Add new test class to module
+    setattr(module, cls_name, cls)
+
+
+def _gen_cases(name, base, params):
+    module = sys.modules[name]
+    for i, param in enumerate(params):
+        _gen_case(base, module, i, param)
+
+
+def parameterize(*params):
+    def f(klass):
+        assert issubclass(klass, unittest.TestCase)
+        _gen_cases(klass.__module__, klass, params)
+        # Remove original base class
+        return None
+    return f
+
+
+def product(parameter):
+    keys = sorted(parameter)
+    values = [parameter[key] for key in keys]
+    values_product = itertools.product(*values)
+    return [dict(zip(keys, vals)) for vals in values_product]
+
+
+def product_dict(*parameters):
+    return [
+        {k: v for dic in dicts for k, v in six.iteritems(dic)}
+        for dicts in itertools.product(*parameters)]
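For reference, `product` expands a dict of value lists into the cross product of parameter dicts (keys in sorted order), while `product_dict` takes lists of pre-built dicts and merges one dict from each list per combination. A small illustration with made-up values:

```python
from testing.parameterized import product, product_dict

# Cross product over value lists, keys visited in sorted order:
print(product({'a': [1, 2], 'b': ['x']}))
# -> [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'x'}]

# Cross product over lists of dicts, merged per combination:
print(product_dict([{'a': 1}], [{'b': 'x'}, {'b': 'y'}]))
# -> [{'a': 1, 'b': 'x'}, {'a': 1, 'b': 'y'}]
```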
diff --git a/tests/ideep4py_tests/testing/random.py b/tests/ideep4py_tests/testing/random.py
new file mode 100644
index 00000000..03d21f58
--- /dev/null
+++ b/tests/ideep4py_tests/testing/random.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+import atexit
+import functools
+import numpy
+import os
+import random
+import types
+import unittest
+
+_old_python_random_state = None
+_old_numpy_random_state = None
+
+
+def _numpy_do_setup(deterministic=True):
+    global _old_python_random_state
+    global _old_numpy_random_state
+    _old_python_random_state = random.getstate()
+    _old_numpy_random_state = numpy.random.get_state()
+    if not deterministic:
+        numpy.random.seed()
+    else:
+        numpy.random.seed(100)
+
+
+def _numpy_do_teardown():
+    global _old_python_random_state
+    global _old_numpy_random_state
+    random.setstate(_old_python_random_state)
+    numpy.random.set_state(_old_numpy_random_state)
+    _old_python_random_state = None
+    _old_numpy_random_state = None
+
+
+def do_setup(deterministic=True):
+    _numpy_do_setup(deterministic)
+
+
+def do_teardown():
+    _numpy_do_teardown()
+
+
+# In some tests (which utilize condition.repeat or condition.retry),
+# setUp/tearDown is nested. _setup_random() and _teardown_random() do their
+# work only in the outermost setUp/tearDown pair.
+_nest_count = 0
+
+
+@atexit.register
+def _check_teardown():
+    assert _nest_count == 0, ('_setup_random() and _teardown_random() '
+                              'must be called in pairs.')
+
+
+def _setup_random():
+    """Sets up the deterministic random state of ``numpy``.
+
+    """
+    global _nest_count
+    if _nest_count == 0:
+        nondeterministic = bool(int(os.environ.get(
+            'CHAINER_TEST_RANDOM_NONDETERMINISTIC', '0')))
+        do_setup(not nondeterministic)
+    _nest_count += 1
+
+
+def _teardown_random():
+    """Tears down the deterministic random state set up by ``_setup_random``.
+
+    """
+    global _nest_count
+    assert _nest_count > 0, '_setup_random has not been called'
+    _nest_count -= 1
+    if _nest_count == 0:
+        do_teardown()
+
+
+def generate_seed():
+    assert _nest_count > 0, 'random is not set up'
+    return numpy.random.randint(0xffffffff)
+
+
+def fix_random():
+    """Decorator that fixes random numbers in a test.
+
+    This decorator can be applied to either a test case class or a test
+    method. It should not be applied within ``condition.retry`` or
+    ``condition.repeat``.
+    """
+
+    # TODO(niboshi): Prevent this decorator from being applied within
+    # condition.repeat or condition.retry decorators. That would repeat
+    # tests with the same random seeds. It's okay to apply this outside
+    # these decorators.
+
+    def decorator(impl):
+        if (isinstance(impl, types.FunctionType) and
+                impl.__name__.startswith('test_')):
+            # Applied to test method
+            @functools.wraps(impl)
+            def test_func(self, *args, **kw):
+                _setup_random()
+                try:
+                    impl(self, *args, **kw)
+                finally:
+                    _teardown_random()
+            return test_func
+        elif isinstance(impl, type) and issubclass(impl, unittest.TestCase):
+            # Applied to test case class
+            klass = impl
+
+            setUp_ = klass.setUp
+            tearDown_ = klass.tearDown
+
+            @functools.wraps(setUp_)
+            def setUp(self):
+                _setup_random()
+                setUp_(self)
+
+            @functools.wraps(tearDown_)
+            def tearDown(self):
+                try:
+                    tearDown_(self)
+                finally:
+                    _teardown_random()
+
+            klass.setUp = setUp
+            klass.tearDown = tearDown
+            return klass
+        else:
+            raise ValueError('Can\'t apply fix_random to {}'.format(impl))
+
+    return decorator
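Putting the helpers together, every test module in this suite follows one pattern: a parameterized class, a fixed random seed, retries for flaky numerical checks, and a `run_module` footer. A minimal hypothetical example (the class and its assertion are ours, not part of the diff):

```python
import unittest

import numpy

import testing
from testing import condition


@testing.parameterize(*testing.product({
    'shape': [(2, 3), (4, 5)],
    'dtype': [numpy.float32],
}))
@testing.fix_random()
class TestExample(unittest.TestCase):

    @condition.retry(3)
    def test_sum_is_finite(self):
        # self.shape and self.dtype are injected by testing.parameterize.
        x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype)
        self.assertTrue(numpy.isfinite(x.sum()))


testing.run_module(__name__, __file__)
```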