diff --git a/cmake/mkl.cmake b/cmake/mkl.cmake index 55115f71..b2a04c16 100644 --- a/cmake/mkl.cmake +++ b/cmake/mkl.cmake @@ -151,7 +151,12 @@ detect_mkl("mkl_rt") if(HAVE_MKL) include_directories(AFTER ${MKLINC}) install(DIRECTORY ${MKLINC}/ DESTINATION include/mklml) - install(DIRECTORY ${MKLINC}/../lib/ DESTINATION lib) + + set(LIBLIST libmklml_intel.so libiomp5.so) + foreach(LIB ${LIBLIST}) + install(FILES ${MKLINC}/../lib/${LIB} DESTINATION lib) + endforeach() + list(APPEND mkldnn_LINKER_LIBS ${MKLLIB}) set(MSG "Intel(R) MKL:") diff --git a/conda/build.sh b/conda/build.sh new file mode 100755 index 00000000..f7079d40 --- /dev/null +++ b/conda/build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Install script for Anaconda environments on macOS and linux. +# This script is not supposed to be called directly, but should be run by: +# +# $ cd +# $ conda build conda +# +# +# If you're debugging this, it may be useful to use the env that conda build is +# using: +# $ cd /conda-bld/ideep_ +# $ source activate _h_env_... # some long path with lots of placeholders +# +# Also, failed builds will accumulate those ideep_ directories. You +# can remove them after a succesfull build with +# $ conda build purge +# +git submodule update --init +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$HOME/.local .. +cd ../python +python setup.py install diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 00000000..9be14791 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,27 @@ +{% set version = "2.0.0b1" %} + +package: + name: ideep4py + version: {{ version }} + +source: + path: ../ + +build: + number: 0 + skip: True # [win] + +requirements: + build: + - numpy + - python + run: + - numpy + - python + +test: + imports: + - ideep4py + +about: + license: MIT diff --git a/docker/python/Dockerfile b/docker/python/Dockerfile new file mode 100644 index 00000000..f39b0591 --- /dev/null +++ b/docker/python/Dockerfile @@ -0,0 +1,11 @@ +FROM ubuntu:16.04 + +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + python-dev \ + python-pip \ + python-wheel \ + python-setuptools && \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + +RUN pip install ideep4py diff --git a/include/ideep.hpp b/include/ideep.hpp index b5e7c000..f62d566e 100644 --- a/include/ideep.hpp +++ b/include/ideep.hpp @@ -43,10 +43,7 @@ #include "ideep/tensor.hpp" #include "ideep/computations.hpp" #include "ideep/allocators.hpp" - -#if __GNUC__ > 4 #include "ideep/fast_math.hpp" -#endif #endif diff --git a/include/ideep/abstract_types.hpp b/include/ideep/abstract_types.hpp index 19306d9b..8cd255f3 100644 --- a/include/ideep/abstract_types.hpp +++ b/include/ideep/abstract_types.hpp @@ -21,38 +21,27 @@ using error = mkldnn::error; // For 2D convolution with grouped weights, the ndims must be 5 (goihw) #define IDEEP_IS_GROUPED_4DIMS(d) (((d).size() == 5) ? 1 : 0) +#define IDEEP_MOD_PTR(ptr, bytes) (((uintptr_t)(ptr)) & ((bytes) - 1)) +#define IDEEP_IS_ALIGNED_PTR(ptr, bytes) ((IDEEP_MOD_PTR(ptr, bytes)) == 0) + /// Same class for resource management, except public default constructor /// Movable support for better performance template > -class c_wrapper{ -protected: - std::shared_ptr::type> _data; +class c_wrapper : + public std::shared_ptr::type> { + using super = std::shared_ptr::type>; public: /// Constructs a C handle wrapper. /// @param t The C handle to wrap. /// @param weak A flag to specify whether to construct a weak wrapper. 
- c_wrapper(T t = nullptr, bool weak = false): _data(t, [weak]() { + c_wrapper(T t = nullptr, bool weak = false): super(t, [weak]() { auto dummy = [](T) { return decltype(traits::destructor(0))(0); }; return weak? dummy : traits::destructor; }()) {} - bool operator==(const T other) const { return other == _data.get(); } - bool operator!=(const T other) const { return !(*this == other); } - - c_wrapper(const c_wrapper& other): _data(other._data) {} - c_wrapper(c_wrapper&& movable) : _data(std::move(movable._data)) {} - - c_wrapper &operator=(c_wrapper&& other) { - _data = std::move(other._data); - return *this; - } - - c_wrapper &operator=(const c_wrapper& other) { - _data = other._data; - return *this; - } + using super::super; /// Resets the value of a C handle. /// @param t The new value of the C handle. @@ -61,17 +50,7 @@ class c_wrapper{ auto dummy_destructor = [](T) { return decltype(traits::destructor(0))(0); }; - _data.reset(t, weak ? dummy_destructor : traits::destructor); - } - - /// Returns the value of the underlying C handle. - T get() const { return _data.get(); } - - bool operator==(const c_wrapper &other) const { - return other._data.get() == _data.get(); - } - bool operator!=(const c_wrapper &other) const { - return !(*this == other); + super::reset(t, weak ? dummy_destructor : traits::destructor); } }; diff --git a/include/ideep/computations.hpp b/include/ideep/computations.hpp index 05210ac0..7c24eaed 100644 --- a/include/ideep/computations.hpp +++ b/include/ideep/computations.hpp @@ -49,13 +49,10 @@ #include "scope_guard.hpp" #include "instruments.hpp" #include - -#if __GNUC__ > 4 +#include #include "fast_math.hpp" #endif -#endif - namespace ideep { template<> @@ -78,10 +75,12 @@ inline tensor::data_type tensor::descriptor::type_to_id() { return tensor::data_type::s8; } -/// Descriptor group, create relative descriptors all in one +/// A group of primitive descriptors, pack related reorder descriptors +/// with computational descriptor. class descriptor_group: public c_wrapper_complex { friend class primitive_group; public: + /// Post ops for fusion operations class post_ops : public c_wrapper { public: post_ops() : c_wrapper([]() { @@ -91,7 +90,6 @@ class descriptor_group: public c_wrapper_complex { return result; }()) {} - int num_ops() const { return mkldnn_post_ops_len(get()); } @@ -195,6 +193,8 @@ class descriptor_group: public c_wrapper_complex { } }; + /// Attribute class for extra information into computations, including + /// post operations, rounding mode, etc. 
class attr_t : public c_wrapper { public: attr_t() : c_wrapper([]() { @@ -273,6 +273,7 @@ class descriptor_group: public c_wrapper_complex { return attr; } + // XXX: concept error static inline attr_t residual(float scale = 1.0, float alpha = 0.f, float beta = 0.f) { attr_t attr; @@ -304,9 +305,14 @@ class descriptor_group: public c_wrapper_complex { } public: + /// Empty construction descriptor_group() : c_wrapper_complex() {} + /// Query interface + /// + /// @param q query kind + /// @param index query index tensor::descriptor expected_descriptor_of(mkldnn::query q , int index = 0) const { mkldnn_primitive_desc_t cdesc; @@ -319,55 +325,83 @@ class descriptor_group: public c_wrapper_complex { return param::descriptor(cdesc); } + /// Query expected input descriptor + /// + /// @param index Input index tensor::descriptor expected_input_descriptor(int index) const { return expected_descriptor_of(mkldnn::input_pd, index); } + /// Query expected output descriptor + /// + /// @param index Input index tensor::descriptor expected_output_descriptor(int index) const { return expected_descriptor_of(mkldnn::output_pd, index); } + /// Query expected src descriptor + /// tensor::descriptor expected_src_descriptor() const { return expected_descriptor_of(mkldnn::src_pd); } + /// Query expected weights descriptor + /// tensor::descriptor expected_weights_descriptor() const { return expected_descriptor_of(mkldnn::weights_pd); } + /// Query expected bias descriptor + /// tensor::descriptor expected_bias_descriptor() const { return expected_descriptor_of(mkldnn::weights_pd, 1); } + /// Query expected dst descriptor + /// tensor::descriptor expected_dst_descriptor() const { return expected_descriptor_of(mkldnn::dst_pd, 0); } + /// Query expected workspace descriptor + /// tensor::descriptor expected_workspace_descriptor() const { return expected_descriptor_of(mkldnn::workspace_pd, 0); } + /// Query expected gradient X descriptor + /// tensor::descriptor expected_gradx_descriptor() const { return expected_descriptor_of(mkldnn::diff_src_pd, 0); } + /// Query expected gradient Y descriptor + /// tensor::descriptor expected_grady_descriptor() const { return expected_descriptor_of(mkldnn::diff_dst_pd, 0); } + /// Qeury expected weights gradient descriptor + /// tensor::descriptor expected_gradw_descriptor() const { return expected_descriptor_of(mkldnn::diff_weights_pd, 0); } + /// Qeury expected bias gradient descriptor + /// tensor::descriptor expected_gradb_descriptor() const { return expected_descriptor_of(mkldnn::diff_weights_pd, 1); } + /// Query number of inputs + /// int num_of_inputs() const { return mkldnn_primitive_desc_query_s32(get() , mkldnn::convert_to_c(mkldnn::num_of_inputs_s32), 0); } + /// Query number of outputs + /// int num_of_outputs() const { return mkldnn_primitive_desc_query_s32(get() , mkldnn::convert_to_c(mkldnn::num_of_outputs_s32), 0); @@ -390,8 +424,11 @@ class descriptor_group: public c_wrapper_complex { } }; +/// A group of primitives, pack related reorder with computation. 
+/// It serves as a base class of computation class primitive_group: public c_wrapper_complex { public: + /// Empty constructor primitive_group() : c_wrapper_complex() {} @@ -404,6 +441,7 @@ class primitive_group: public c_wrapper_complex { return cdesc; } + /// Query interface tensor::descriptor expected_descriptor_of(mkldnn::query q, int index = 0) const { mkldnn_primitive_desc_t cdesc; @@ -430,6 +468,8 @@ class primitive_group: public c_wrapper_complex { auxiliaries_[index].reset(result); } + /// Specific query interface, not valid for all computations. + /// tensor::descriptor expected_input_descriptor(int index) const { return expected_descriptor_of(mkldnn::input_pd, index); } @@ -688,6 +728,8 @@ struct spliter : public reorder { } }; +/// Computation class, abstruct of computation +/// struct computation : public primitive_group { computation() = default; @@ -837,9 +879,26 @@ struct computation : public primitive_group { std::vector primitive_inputs_; }; +/// Convolution forward computation, this class represent a MKL-DNN +/// convolution forward process, also manage old computation instances. struct convolution_forward: public computation, public utils::computation_cache { + /// Descriptor class for describing convolution forward process + /// struct descriptor : public descriptor_group { + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param bias_desc Bias tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &bias_desc, @@ -877,6 +936,19 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &dst_desc, @@ -912,6 +984,21 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param bias_desc Bias tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param dilates Dilates parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind 
Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &bias_desc, @@ -949,6 +1036,20 @@ struct convolution_forward: public computation, reset(result); create_reorder_pds({src_desc, weights_desc}); } + + /// Constructor + /// + /// @param src_desc Input tensor descriptor + /// @param weights_desc Weights tensor descriptor + /// @param dst_desc Result tensor descriptor + /// @param strides Strides parameters for the convolution + /// @param dilates Dilates parameters for the convolution + /// @param padding_l Paddings of up-left + /// @param padding_r Paddings of down-right + /// @param attr Extra attribute for the convolution + /// @param aalgorithm Convolution algorithm + /// @param aprop_kind The propagation kind of convolution + /// @param apadding_kind Padding kind of convolution descriptor(const tensor::descriptor &src_desc, const tensor::descriptor &weights_desc, const tensor::descriptor &dst_desc, @@ -2211,6 +2312,112 @@ struct eltwise_backward : public computation, } }; +struct channel_shuffle_forward { +public: + channel_shuffle_forward() = delete; + +public: + static void compute_impl(const tensor& src, tensor& dst, int group) { + auto C = src.get_dim(1); + auto K = C / group; + auto S = src.get_dim(2) * src.get_dim(3); // h * w + float* X = static_cast(src.get_data_handle()); + float* Y = static_cast(dst.get_data_handle()); + + IDEEP_ENFORCE(C % group == 0, "Invalid channel and group"); + IDEEP_ENFORCE(src.get_data_type() == tensor::data_type::f32, "invalid data type"); + + if (group <= 1) { + direct_copy::compute(src, dst); + return; + } + + # pragma omp parallel for collapse(3) schedule(static) + for (auto n = 0; n < src.get_dim(0); n++) { + for (auto g = 0; g < group; g++) { + for (auto i = 0; i < K; i++) { + auto* X_offset = (X + g * K * S + n * C * S + S * i); + auto* Y_offset = (Y + g * S + n * C * S + group * S * i); +#ifdef __AVX2__ + FM_AVX2_PREF::memcpy(X_offset, Y_offset, S); +#else + std::memcpy(Y_offset, X_offset, sizeof(float) * S); +#endif + } + } + } + } + + template + static void compute(const tensor& src, tensor& dst, const int group = 1) { + IDEEP_ENFORCE(src != dst, "Unsupport in-place op"); + IDEEP_ENFORCE(src.ndims() == 4, "Only support 4 dims"); + + auto src_in = src; + if (!src_in.is_public_format()) { + src_in.init( + {src.get_dims(), src.get_data_type(), format::nchw}); + reorder::compute(src, src_in); + } + + dst.reinit_like(src_in); + compute_impl(src_in, dst, group); + } +}; + +struct channel_shuffle_backward { +public: + channel_shuffle_backward() = delete; + +public: + static void compute_impl(const tensor& grady, tensor& gradx, int group) { + auto C = grady.get_dim(1); + auto K = C / group; + auto S = grady.get_dim(2) * grady.get_dim(3); // h * w + float* dY = static_cast(grady.get_data_handle()); + float* dX = static_cast(gradx.get_data_handle()); + + IDEEP_ENFORCE(C % group == 0, "Invalid channel and group"); + IDEEP_ENFORCE(grady.get_data_type() == tensor::data_type::f32, "invalid data type"); + + if (group <= 1) { + direct_copy::compute(grady, gradx); + return; + } + + # pragma omp parallel for collapse(3) schedule(static) + for (auto n = 0; n < grady.get_dim(0); n++) { + for (auto g = 0; g < group; g++) { + for (auto i = 0; i < K; i++) { + auto* dY_offset = (dY + g * S + n * C * S + group * S * i); + auto* dX_offset = (dX + g * K * S + n * C * S + S * i); +#ifdef __AVX2__ + FM_AVX2_PREF::memcpy(dY_offset, dX_offset, S); +#else + std::memcpy(dX_offset, 
dY_offset, sizeof(float) * S); +#endif + } + } + } + } + + template + static void compute(const tensor& grady, tensor& gradx, const int group = 1) { + IDEEP_ENFORCE(grady != gradx, "Unsupport in-place op"); + IDEEP_ENFORCE(grady.ndims() == 4, "Only support 4 dims"); + + auto grady_in = grady; + if (!grady_in.is_public_format()) { + grady_in.init( + {grady.get_dims(), grady.get_data_type(), format::nchw}); + reorder::compute(grady, grady_in); + } + + gradx.reinit_like(grady_in); + compute_impl(grady_in, gradx, group); + } +}; + struct sum : public computation, public utils::computation_cache { struct descriptor : public descriptor_group { @@ -3413,7 +3620,6 @@ struct dropout_backward { } }; -#if __GNUC__ > 4 struct eltwise_binary { public: enum eltwise_binary_op { @@ -3439,12 +3645,14 @@ struct eltwise_binary { } switch (op) { case ELTWISE_ADD: - utils::fast_math::add( +#ifdef __AVX2__ + FM_AVX2_PREF::add( static_cast(outputC.get_data_handle()), static_cast(inputA.get_data_handle()), static_cast(inputB_data), static_cast(inputA.get_nelems())); return; +#endif case ELTWISE_MUL: case ELTWISE_DIV: default: @@ -3455,7 +3663,6 @@ struct eltwise_binary { } } }; -#endif struct sum_array { public: diff --git a/include/ideep/fast_math.hpp b/include/ideep/fast_math.hpp index eb17c16b..31aac561 100644 --- a/include/ideep/fast_math.hpp +++ b/include/ideep/fast_math.hpp @@ -2,9 +2,16 @@ #define _FAST_MATH_HPP_ #include #include +#include #include #include #include "abstract_types.hpp" + +#ifdef __AVX2__ + +#define FM_AVX2_PREF \ + ideep::utils::fast_math + namespace ideep { namespace utils { @@ -18,147 +25,110 @@ typedef enum { avx512_mic_4ops, } cpu_isa_t; -template struct TypeMap {}; -#define MAP_T(v, F, I) \ - template<> struct TypeMap { using tF = F; using tI = I;}; -MAP_T(avx2, __m256, __m256i) -#undef MAP_T -#define TF typename TypeMap::tF -#define TI typename TypeMap::tI template class fast_math { static constexpr int thread_hold = 1024; + static constexpr int align_bytes = 32; + public: + using TF = __m256; + using TI = __m256i; template static inline unsigned get_vec_sz() { - switch (isa) { - case avx2: - return 256/8/sizeof(T); - case avx512_common: - case avx512_core: - return 512/8/sizeof(T); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return 0; - } + return 256 / 8 / sizeof(T); } - // Move this to utils - template static inline TI size_to_mask(unsigned nres) { - constexpr int on = -1; - constexpr int off = 0; - switch (isa) { - case avx2: - assert(nres < 8 && nres > 0); - switch(nres) { - case 1: - return _mm256_set_epi32(off, off, off, off, off, off, off, on); - case 2: - return _mm256_set_epi32(off, off, off, off, off, off, on, on); - case 3: - return _mm256_set_epi32(off, off, off, off, off, on, on, on); - case 4: - return _mm256_set_epi32(off, off, off, off, on, on, on, on); - case 5: - return _mm256_set_epi32(off, off, off, on, on, on, on, on); - case 6: - return _mm256_set_epi32(off, off, on, on, on, on, on, on); - case 7: - return _mm256_set_epi32(off, on, on, on, on, on, on, on); - default: - return _mm256_set_epi32(off, off, off, off, off, off, off, off); - } - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - } + assert(nres < 8 && nres >= 0); + std::bitset<8> e = ~((1 << nres) - 1); + return _mm256_set_epi32(e[7]-1, e[6]-1, e[5]-1, e[4]-1, e[3]-1, e[2]-1, e[1]-1, e[0]-1); } -#define BIN_OPS(name) \ - template \ - static TF name##_ps (TF v1, TF v2) { \ - switch (isa) { \ - case avx2: \ - return _mm256_##name##_ps(v1, v2); \ - default: \ 
- throw error(mkldnn_unimplemented, "Not implemented!"); \ - return set1_ps(0.f); \ - } \ + static inline TF add_ps(TF v1, TF v2) { + return _mm256_add_ps(v1, v2); } - BIN_OPS(add); - BIN_OPS(mul); - BIN_OPS(div); -#undef BIN_OPS + static inline TF mul_ps(TF v1, TF v2) { + return _mm256_mul_ps(v1, v2); + } - template - static TF set1_ps (const T v) { - switch (isa) { - case avx2: - return _mm256_set1_ps(v); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + static inline TF div_ps(TF v1, TF v2) { + return _mm256_div_ps(v1, v2); } - template - static TF sqrt_ps (TF v) { - switch (isa) { - case avx2: - return _mm256_sqrt_ps(v); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + static inline TF sqrt_ps(TF v) { + return _mm256_sqrt_ps(v); } - template - static TF load_ps (const T *src) { - switch (isa) { - case avx2: - return _mm256_load_ps(src); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + template + static inline TF set1_ps(const T v) { + return _mm256_set1_ps(v); } - template - static TF maskload_ps (const T *src, TI mask) { - switch (isa) { - case avx2: - return _mm256_maskload_ps(src, mask); - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return set1_ps(0.f); - } + template + static inline TF load_ps(const T *src) { + return _mm256_load_ps(src); } - template - static void store_ps (T *dst, TF v) { - switch (isa) { - case avx2: - _mm256_store_ps(dst, v); - return; - default: - throw error(mkldnn_unimplemented, "Not implemented!"); - return; - } + template + static inline TF maskload_ps(const T *src, TI mask) { + return _mm256_maskload_ps(src, mask); } - template - static void maskstore_ps (T *dst, TI mask, TF v) { - switch (isa) { - case avx2: - _mm256_maskstore_ps(dst, mask, v); - return; - default: - throw error(mkldnn_unimplemented, "Not implemented!"); + template + static inline void store_ps(T *dst, TF v) { + _mm256_store_ps(dst, v); + } + + template + static inline void maskstore_ps(T *dst, TI mask, TF v) { + _mm256_maskstore_ps(dst, mask, v); + } + + template + static inline void memcpy(T* src, T* dst, size_t size) { + auto itemsize = sizeof(T); + auto vec_sz = get_vec_sz(); + auto num_vec = size / vec_sz; + auto num_res = size % vec_sz; + + if ((size < vec_sz) || + (IDEEP_MOD_PTR(src, align_bytes) != IDEEP_MOD_PTR(dst, align_bytes))) { + std::memcpy(dst, src, itemsize * size); return; } + + auto cpy_cnt = 0; + auto cur_res = num_res; + auto cur_vec = num_vec; + if (!IDEEP_IS_ALIGNED_PTR(src, align_bytes)) { + cpy_cnt = (align_bytes - IDEEP_MOD_PTR(src, align_bytes)) / itemsize; + std::memcpy(dst, src, itemsize * cpy_cnt); + src += cpy_cnt; + dst += cpy_cnt; + } + IDEEP_ENFORCE(cpy_cnt < vec_sz, "invalid copy count"); + IDEEP_ENFORCE(IDEEP_IS_ALIGNED_PTR(dst, align_bytes), "not bytes aligned address"); + + if (cpy_cnt > cur_res) { + cur_vec -= 1; + cur_res = vec_sz - (cpy_cnt - cur_res); + } else { + cur_res -= cpy_cnt; + } + + for (auto j = 0; j < cur_vec; j++, dst += vec_sz, src += vec_sz) { + auto vmm = load_ps(src); + store_ps(dst, vmm); + } + + if (cur_res != 0) { + auto mask = size_to_mask(cur_res); + auto vmm = maskload_ps(src, mask); + maskstore_ps(dst, mask, vmm); + } } // Unary ops @@ -176,7 +146,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src, mask); vmm1 = op_mask(vmm1, mask); maskstore_ps(dst, mask, vmm1); @@ -190,11 
+160,11 @@ class fast_math { single_thread_vecwise_unary_op(dst, src, nelems, op, op_mask); } - template + template static void inv_square_var(float epsilon, - const elem_t* inv_sqrt_var, elem_t* variance, unsigned nelems) { + const T* inv_sqrt_var, T* variance, unsigned nelems) { if (isa == avx2) { - if (std::is_same::value) { + if (std::is_same::value) { const float *src = reinterpret_cast(inv_sqrt_var); float *dst = reinterpret_cast(variance); @@ -222,11 +192,11 @@ class fast_math { } } - template + template static void inv_sqrt_var(float epsilon, const void* variance, void* inv_sqrt_var, unsigned nelems) { if (isa == avx2) { - if (std::is_same::value) { + if (std::is_same::value) { const float *src = reinterpret_cast(variance); float *dst = @@ -245,7 +215,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src, mask); vmm1 = add_ps(vmm1, epsilones); vmm1 = sqrt_ps(vmm1); @@ -277,7 +247,7 @@ class fast_math { } if (nres != 0) { - TI mask = size_to_mask(nres); + TI mask = size_to_mask(nres); TF vmm1 = maskload_ps(src1, mask); TF vmm2 = maskload_ps(src2, mask); vmm2 = op_mask(vmm1, vmm2); @@ -292,12 +262,12 @@ class fast_math { single_thread_vecwise_binary_op(dst, src1, src2, nelems, op, op_mask); } - template - static void add(elem_t *dst, const elem_t *src1, const elem_t *src2, + template + static void add(T *dst, const T *src1, const T *src2, unsigned nelems) { - if (std::is_same::value) { + if (std::is_same::value) { auto op = [] (TF vmm1, TF vmm2) { - vmm1 = add_ps(vmm1, vmm2); + vmm1 = add_ps(vmm1, vmm2); return vmm1; }; vecwise_binary_op(dst, src1, src2, nelems, op, op); @@ -305,7 +275,10 @@ class fast_math { throw error(mkldnn_unimplemented, "Not implemented!"); } } + }; } } #endif + +#endif diff --git a/include/ideep/tensor.hpp b/include/ideep/tensor.hpp index 56739798..4f382d04 100644 --- a/include/ideep/tensor.hpp +++ b/include/ideep/tensor.hpp @@ -12,15 +12,18 @@ namespace ideep { struct computation; /// @addtogroup api_tensor Tensor -// -/// Param class describes parameters internal to operators +/// +/// Param class handles operands to computations' internal, it wrappers MKL-DNN +/// memory primitive and provides utilities to manipulate underlying object. +/// It's also the base class of tensor, handles major tensor services. class param: public c_wrapper { public: using dims = mkldnn::memory::dims; using dim_t = dims::value_type; using data_type = mkldnn::memory::data_type; - /// A param descriptor. + /// Param descriptor class wrappers MKL-DNN memory primitive descriptor + /// and provides utilities to manipulate underlying object struct descriptor : public c_wrapper { friend class param; inline static mkldnn_primitive_kind_t convert_to_c(kind akind) { @@ -42,7 +45,6 @@ class param: public c_wrapper { md.format = convert_to_c(aformat); } - // borrowed from memory_desc_wrapper static inline void set_default_strides(dims &strides, const dims &adims, const int *perm = NULL) { static const int id_perm[] @@ -76,12 +78,12 @@ class param: public c_wrapper { } public: - /// Initiate a param descriptor, specifying all details. + /// Initiate a param descriptor, specifying blocking details. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. /// @param extra block information for data. 
- /// @param perm permutation for layout sequence + /// @param perm permutation for layout sequence descriptor(const dims adims, data_type adata_type, const dims stride, const dims block_dims, const dims stride_inner = dims(12, 1)) : c_wrapper([&adims, adata_type, &block_dims, @@ -98,7 +100,7 @@ class param: public c_wrapper { return result; }()), public_format_(format::blocked) {} - /// Initiate a param descriptor, specifying format. + /// Initiate a param descriptor, using format for blocking initialization. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. @@ -107,6 +109,7 @@ class param: public c_wrapper { :c_wrapper([&adims, adata_type, aformat]() { mkldnn::memory::validate_dims(adims); + // XXX: out of range enum might result unspecified behavior mkldnn_memory_desc_t data; if (adims.size() == 3) { fill_param(data, adims, adata_type, aformat); @@ -130,7 +133,7 @@ class param: public c_wrapper { return result; }()), public_format_(public_format(aformat)) {} - /// Initiate a param descriptor, specifying no format. + /// Initiate a param descriptor, assume nature format. /// /// @param adims Data dimensions /// @param adata_type Data precision/type. @@ -143,15 +146,15 @@ class param: public c_wrapper { public_format_ = format::format_undef; } - /// Initiate a tensor descriptor from primitive_desc_t struct + /// Initiate a descriptor from primitive_desc_t struct /// /// @param adesc Pointer to a primitive_desct_t C struct - /// @param aformat Specify a format for current descriptor + /// @param aformat Specify public format for current descriptor descriptor(mkldnn_primitive_desc_t adesc, format aformat) :c_wrapper(adesc), public_format_(aformat) { } - /// Initiate a tensor descriptor from primitive_desc_t struct + /// Initiate a descriptor from primitive_desc_t struct /// /// @param adesc Pointer to a primitive_desct_t C struct descriptor(mkldnn_primitive_desc_t adesc) : descriptor(adesc, @@ -160,19 +163,19 @@ class param: public c_wrapper { mkldnn_primitive_desc_query_memory_d(adesc)->format))) { } - /// Initiate a tensor descriptor from another one, share resource + /// Initiate a descriptor from another, share resource /// /// @param adesc is a reference to another descriptor descriptor(const descriptor &adesc): c_wrapper(adesc), public_format_ (adesc.public_format_) { } - /// Empty initiate a tensor decriptor + /// Empty decriptor constructor /// descriptor():descriptor(dims(0), data_type::f32, format::format_undef) { } - /// Copy a tensor descriptor from another, share resource + /// Share a descriptor from another, share resource descriptor &operator=(const descriptor& adesc) { c_wrapper::operator=(adesc); public_format_ = adesc.public_format_; @@ -181,19 +184,26 @@ class param: public c_wrapper { /// Returns the number of bytes required to allocate the memory /// described including the padding area. + /// inline size_t get_size() const { return mkldnn_memory_primitive_desc_get_size(get()); } + /// Returns number of dimensions + /// inline int ndims() const { return get_mkldnn_memory_desc_t()->ndims; } + /// Returns dimension vector + /// inline dims get_dims() const { auto *internal = get_mkldnn_memory_desc_t(); return dims(internal->dims, &internal->dims[internal->ndims]); } + /// Returns descriptor data type + /// inline data_type get_data_type() const { auto *internal = get_mkldnn_memory_desc_t(); return static_cast(internal->data_type); @@ -225,6 +235,7 @@ class param: public c_wrapper { /// pre-condition. 4-dimension only /// 1. 
(format_undef, nchw) for all unknown format creation /// 2. (format_undef, ) compatible with all public correspondent + /// @param expected Expected format to transform to descriptor format_to(format expected) const { mkldnn_memory_desc_t adesc; const mkldnn_memory_desc_t *origin = get_mkldnn_memory_desc_t(); @@ -256,6 +267,9 @@ class param: public c_wrapper { return descriptor(result, expected); } + /// Change format from data representation to weights, only nature formats + /// were supported. + /// Example: from nchw to oihw descriptor as_weights_format() const { switch(get_internal_format()) { case format::nc: @@ -299,15 +313,18 @@ class param: public c_wrapper { return mkldnn_primitive_desc_query_memory_d(get()); } + /// Operator == inline bool operator ==(const descriptor &other) const { - // TODO: (format_undef, *) == (nhwc, *) like return mkldnn_memory_primitive_desc_equal(get(), other.get()); } + /// Operator != inline bool operator !=(const descriptor &other) const { return !operator==(other); } + /// Return format generated by MKL-DNN + // XXX: format might be out of range. format get_internal_format() const { return static_cast(this->get_mkldnn_memory_desc_t()->format); } @@ -438,7 +455,13 @@ class param: public c_wrapper { } }; + /// View is for describing a subregion from a param + /// struct view : public c_wrapper { + /// Create view by specifying starting coordinate and size of each dimension + /// @param host From which the view was created + /// @param volume Size of each dimension of the subregion + /// @param start Start coordinates view (const descriptor& host, dims volume, dims start) { mkldnn_primitive_desc_t result; error::wrap_c_api(mkldnn_view_primitive_desc_create(&result, @@ -512,6 +535,10 @@ class param: public c_wrapper { } }; + /// The template initialize param with a descriptor, allocate and manage + /// buffer automatically. A customized allocator can be specified to override + /// default implementation. + /// @param adesc Descriptor for the param template void init(const descriptor &adesc) { mkldnn_primitive_t result; @@ -527,6 +554,9 @@ class param: public c_wrapper { public_format_ = adesc.public_format_; } + /// The template initialize param with a descriptor. Specifiy extra buffer. + /// @param adesc Descriptor for the param + /// @param ahandle Buffer of the param void init(const descriptor &adesc, void *ahandle) { mkldnn_primitive_t result; error::wrap_c_api( @@ -534,24 +564,26 @@ class param: public c_wrapper { "could not create a memory primitive"); reset(result); - set_data_handle(ahandle); buffer_.reset(); + set_data_handle(ahandle); public_format_ = adesc.public_format_; } + /// The template initialize param with a descriptor, allocate and manage + /// buffer automatically. A customized allocator can be specified to override + /// default implementation. 
+ /// @param adesc Descriptor for the param void init(const descriptor &adesc) { init(adesc); } /// Function that refill tensor with new description or buffer - // template void reinit(const descriptor &adesc) { auto curr_size = get_size(); auto new_size = adesc.get_size(); - if (curr_size >= new_size || - (buffer_ == nullptr && get_data_handle() != nullptr)) { + if (curr_size >= new_size && buffer_.get() == get_data_handle()) { // We don't have to allocate new buffer or we don't manage the buffer // either way, we don't allocate new buffer // People who manage buffer provide enough space @@ -584,11 +616,15 @@ class param: public c_wrapper { /// Constructs a param and allocating internal buffer. /// - /// @param adesc param descriptor. + /// @param adesc Descriptor for the param param(const descriptor &adesc) { init(adesc); } + /// Constructs a param and allocating internal buffer. + /// + /// @param adesc Descriptor for the param. + /// @param ahandle Buffer for the param. param(const descriptor &adesc, void *ahandle) { init(adesc, ahandle); } @@ -596,11 +632,16 @@ class param: public c_wrapper { /// Recreate a param with completely different content from old one /// but reuse the param shell. Notice that after resize, its format /// is undefined + /// @param adims New dimension + /// @param adata_type New data type void resize(dims adims, data_type adata_type) { descriptor adesc(adims, adata_type); init(adesc); } + /// Reshape a param, reorder might happen if its format is internal + /// @param new_dims New dimension + /// @result Return new param reference param &reshape(dims new_dims) { if (!get_descriptor().is_shape_compatible(new_dims)) { throw error(mkldnn_runtime_error, "reshape to incompatible shape"); @@ -609,8 +650,8 @@ class param: public c_wrapper { param p; p.init({get_dims(), get_data_type()}); reorder_to(p); - set_data_handle(p.get_data_handle()); buffer_ = p.get_tensor_buffer(); + set_data_handle(p.get_data_handle()); } set_descriptor({new_dims, get_data_type()}); @@ -619,11 +660,12 @@ class param: public c_wrapper { return *this; } + // XXX: ??? param &_reshape(dims new_dims) { return reshape(new_dims); } - /// Returns the internal structure of primitive descriptor. + /// Returns pointer to structure of primitive descriptor. 
const_mkldnn_primitive_desc_t get_mkldnn_primitive_desc_t() const { const_mkldnn_primitive_desc_t cdesc; error::wrap_c_api(mkldnn_primitive_get_primitive_desc(get(), @@ -632,6 +674,7 @@ class param: public c_wrapper { return cdesc; } + /// Return pointer to memory descriptor structure const mkldnn_memory_desc_t *get_mkldnn_memory_desc_t() const { const_mkldnn_primitive_desc_t cdesc; error::wrap_c_api(mkldnn_primitive_get_primitive_desc(get(), @@ -650,7 +693,10 @@ class param: public c_wrapper { return descriptor(clone, public_format_); } - // Force a descriptor into param + /// Set a descriptor into param to replace the older one, keep buffer + /// It is caller's responsibility to make sure the original buffer is large + /// enough for specified descriptor + /// @param new_desc New descriptor void set_descriptor(const descriptor& new_desc) { // Keep the original management auto buf = std::move(buffer_); @@ -659,15 +705,21 @@ class param: public c_wrapper { public_format_ = new_desc.public_format_; } + /// Create a view from current param + /// @param view_dims Size of each dimension of the view + /// @param offsets Start cooridinate of the view view create_view(dims view_dims, dims offsets) const { return view(get_descriptor(), view_dims, offsets); } + /// Reture param's data type inline data_type get_data_type() const { const mkldnn_memory_desc_t *adesc = get_mkldnn_memory_desc_t(); return static_cast(adesc->data_type); } + /// Return size of specified dimension + /// @param index Dimension index inline dim_t get_dim(int index) const { if (index < 0 || index >= ndims()) return static_cast(0); @@ -675,23 +727,28 @@ class param: public c_wrapper { return mdesc->dims[index]; } + /// Return dimensions' size vector inline dims get_dims() const { const mkldnn_memory_desc_t *mdesc = get_mkldnn_memory_desc_t(); return dims (mdesc->dims, &mdesc->dims[mdesc->ndims]); } + /// Return number of dimensions inline int ndims() const { return get_mkldnn_memory_desc_t()->ndims; } + /// Return whether the tensor is empty inline bool is_empty() const { return ndims() == 0 && get_data_handle() == 0; } + /// Return buffer size required by the param inline size_t get_size() const { return mkldnn_memory_primitive_desc_get_size(get_mkldnn_primitive_desc_t()); } + /// Return element number of the param inline dim_t get_nelems() const { const mkldnn_memory_desc_t *mdesc = get_mkldnn_memory_desc_t(); return std::accumulate( @@ -701,15 +758,18 @@ class param: public c_wrapper { /// Returns a handle of the data contained in the param. On /// the CPU engine, this is a pointer to the allocated memory. inline void *get_data_handle() const { - void *handle; - error::wrap_c_api(mkldnn_memory_get_data_handle(get(), &handle), - "could not get native handle"); - return handle; + void *handle; + error::wrap_c_api(mkldnn_memory_get_data_handle(get(), &handle), + "could not get native handle"); + return handle; } + /// Set new buffer handle into param + /// @param handle Buffer handle inline void set_data_handle(void *handle) { - error::wrap_c_api(mkldnn_memory_set_data_handle(get(), handle), - "could not set native handle"); + if (buffer_.get() != handle && buffer_ != nullptr) buffer_.reset(); + error::wrap_c_api(mkldnn_memory_set_data_handle(get(), handle), + "could not set native handle"); } /// Materialize a param. 
For specific scenario param will allocate @@ -746,10 +806,12 @@ class param: public c_wrapper { return static_cast(aformat); } + /// Return internal format of the param inline format get_internal_format() const { return static_cast(get_mkldnn_memory_desc_t()->format); } + /// Need reorder if current param used by non MKL-DNN routines. inline bool need_reorder() const { return get_internal_format() != public_format_; } @@ -860,11 +922,16 @@ class param: public c_wrapper { std::shared_ptr buffer_; }; -/// Tensor that describes the data and its explanation. +/// Tensor that describes data buffer and its explanation. +/// It also integrates an optional tensor as an intemediate results, used in +/// Pooling/LRN class tensor : public param { public: using param::param; + /// Pack an extra tensor into current one, allocate buffer using specified + /// allocator. + /// @param descriptor Descriptor of the extra tensor template void init_extra(const descriptor &workspace) { auto twin = new tensor(); @@ -872,66 +939,97 @@ class tensor : public param { twin_.reset(twin); } + /// Pack an extra tensor into current one + /// + /// @param descriptor Descriptor of the extra tensor + /// @param handle Buffer handle void init_extra(const descriptor &workspace, void *handle) { twin_.reset(new tensor(workspace, handle)); } + /// Pack an extra tensor into current one + /// + /// @param tensor Extra tensor to pack in void init_extra(const tensor &ws) { twin_.reset(); twin_ = std::make_shared(ws); } // for gcc4.8 + /// Empty construction tensor() : param() {} + /// Construct tensor + /// + /// @param major Descriptor for the tensor + /// @param workspace Extra descriptor of the tensor which will be packed in tensor(const descriptor &major, const descriptor &workspace) : tensor(major) { init_extra(workspace); } + /// Construct tensor + /// + /// @param major Descriptor of the tensor + /// @param h_major Buffer handle of the tensor + /// @param workspace Descriptor of the extra tensor tensor(const descriptor &major, void *h_major, const descriptor &workspace) : tensor(major, h_major) { init_extra(workspace); } + /// Construct tensor + /// + /// @param major Descriptor of the tensor + /// @param h_major Buffer handle of the tensor + /// @param workspace Descriptor of the extra tensor + /// @param h_workspace Buffer handle of the extra tensor tensor(const descriptor &major, void *h_major, const descriptor &workspace, void *h_workspace) : tensor(major, h_major) { init_extra(workspace, h_workspace); } + /// Copy constructor tensor (const tensor& t) : param(t) { twin_ = t.twin_; } + /// Move constructor tensor (tensor&& movable) : param(std::move(movable)) { twin_ = std::move(movable.twin_); } + /// Assignment operator tensor &operator = (const tensor& t) { param::operator = (t); twin_ = t.twin_; return *this; } + /// Move assignment operator tensor &operator = (tensor&& movable) { param::operator = (std::move(movable)); twin_ = std::move(movable.twin_); return *this; } + /// Return extra packed tensor tensor *get_extra() { return twin_.get(); } + /// Return extra packed tensor const tensor *get_extra() const { return twin_.get(); } + /// Decide wether there is an extra tensor packed in bool has_extra() const { return twin_ != nullptr; } + // XXX: ??? 
tensor as_weights() const { tensor ret = *this; if (!is_weights()) diff --git a/python/ideep4py/py/mm/mdarray.cc b/python/ideep4py/py/mm/mdarray.cc index b092a81c..4a397988 100755 --- a/python/ideep4py/py/mm/mdarray.cc +++ b/python/ideep4py/py/mm/mdarray.cc @@ -282,6 +282,50 @@ PyObject *mdarray::inplace_axpby(float a, PyObject *self, float b, PyObject *o) return self; } +void mdarray::set(PyObject *o) { + // Resource manager, for GCC do not accept lambda + struct py_decref { + void operator () (PyObject *p) const { + Py_DECREF(p); + } + }; + + std::unique_ptr op(nullptr); + + // Create mdarray from buffer provider + if (reinterpret_cast(o->ob_type) == &PyArray_Type) { + o = py_mdarray_from(o); + op.reset(o); + } + + void *oprd2; + int res = SWIG_ConvertPtr(o, &oprd2, nullptr, 0); + + if (!SWIG_IsOK(res)) { + PyErr_SetString(PyExc_ValueError, "Wrong operand object in add wrapper"); + return; + } + + auto in = *(reinterpret_cast(oprd2))->get(); + auto dims = get_dims(); + auto in_dims = in.get_dims(); + if (dims.size() != in_dims.size()) + throw error(mkldnn_invalid_arguments, "mdarray set: Inconsistent ndims"); + for (size_t d = 0; d < dims.size(); d++) { + if (dims[d] != in_dims[d]) + throw error(mkldnn_invalid_arguments, "mdarray set: Inconsistent dims"); + } + + tensor in_ = in; + if (in.get_descriptor() != get_descriptor()) { + in_.init(get_descriptor()); + reorder::compute(in, in_); + } + + memcpy(get_data_handle(), in_.get_data_handle(), get_size()); + return; +} + PyObject *mdarray::m_Add(PyObject *self, PyObject *o) { // Array Broadcast if (!is_mdarray_supported(self, o)) { diff --git a/python/ideep4py/py/mm/mdarray.h b/python/ideep4py/py/mm/mdarray.h index 18d5bef9..25eae780 100755 --- a/python/ideep4py/py/mm/mdarray.h +++ b/python/ideep4py/py/mm/mdarray.h @@ -325,6 +325,8 @@ class mdarray : public ideep::tensor { PyObject *flat(void); + void set(PyObject *o); + PyObject *reshape(py_handle *self, std::vector dims); PyObject *m_mult_div(PyObject *self, PyObject *o, int mult_or_div, bool inplace); diff --git a/python/ideep4py/py/mm/mdarray.i b/python/ideep4py/py/mm/mdarray.i index cf236547..d952aaa0 100644 --- a/python/ideep4py/py/mm/mdarray.i +++ b/python/ideep4py/py/mm/mdarray.i @@ -100,6 +100,11 @@ PyObject *flat() { return (*self)->flat(); } + + void set(PyObject *o){ + (*self)->set(o); + return; + } } /* mdarray::reshape */ diff --git a/python/ideep4py/tests/mm/test_mdarray_set_mm.py b/python/ideep4py/tests/mm/test_mdarray_set_mm.py new file mode 100755 index 00000000..797f6597 --- /dev/null +++ b/python/ideep4py/tests/mm/test_mdarray_set_mm.py @@ -0,0 +1,54 @@ +import ideep4py # NOQA +import numpy +import testing +from ideep4py import mdarray +import unittest + + +class TestMdarraySet(unittest.TestCase): + def setUp(self): + self.check_options = {'atol': 1e-5, 'rtol': 1e-4} + + def test_set1(self): + x = numpy.array([1, 1, 1], dtype=numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x = numpy.array([1, 2, 1], dtype=numpy.float32) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set2(self): + x = numpy.arange(24, dtype=numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x.fill(1) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set3(self): + x = numpy.random.rand(10, 10, 10, 10) + x = x.astype(numpy.float32) + mx = mdarray(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + x = 
numpy.random.rand(10, 10, 10, 10) + x = x.astype(numpy.float32) + mx.set(x) + numpy.testing.assert_allclose( + mx, x, **self.check_options) + + def test_set4(self): + x = numpy.array([0, 0, 0, 0], dtype=numpy.float32) + mx1 = mdarray(x) + mx2 = mdarray(x) + mx1.fill(0) + mx2.set(mx1) + numpy.testing.assert_allclose( + mx1, mx2, **self.check_options) + + +testing.run_module(__name__, __file__) diff --git a/python/setup.py b/python/setup.py index 42e4c8bc..be4ce944 100644 --- a/python/setup.py +++ b/python/setup.py @@ -62,6 +62,12 @@ def clean_ext(): ############################################################################### # Custom build commands ############################################################################### +class build(distutils.command.build.build): + def run(self): + prepare_ext() + distutils.command.build.build.run(self) + + class build_ext(setuptools.command.build_ext.build_ext): def run(self): prepare_ext() @@ -84,6 +90,7 @@ def run(self): cmdclass = { + 'build': build, 'build_ext': build_ext, 'install': install, 'clean': clean, @@ -160,7 +167,7 @@ def run(self): setup( name='ideep4py', - version='2.0.0_b1', + version='2.0.0', description='ideep4py is a wrapper for iDeep library.', author='Intel', author_email='',
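Usage sketch for the channel_shuffle_forward / channel_shuffle_backward computations added in include/ideep/computations.hpp. This snippet is not part of the patch: it assumes the allocator template parameter of compute() defaults like those of the other computations in that header, and the shapes and group count are illustrative only.

    #include <algorithm>
    #include "ideep.hpp"

    int main() {
      using tensor = ideep::tensor;

      // NCHW input with 8 channels, shuffled in 2 groups of 4.
      tensor::descriptor src_desc({1, 8, 4, 4}, tensor::data_type::f32,
                                  ideep::format::nchw);
      tensor src(src_desc);
      std::fill_n(static_cast<float *>(src.get_data_handle()),
                  src.get_nelems(), 1.f);

      // Forward shuffle; dst is allocated inside compute() via reinit_like().
      tensor dst;
      ideep::channel_shuffle_forward::compute(src, dst, /* group = */ 2);

      // Backward pass permutes the gradient back to the original order.
      tensor gradx;
      ideep::channel_shuffle_backward::compute(dst, gradx, /* group = */ 2);
      return 0;
    }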
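The vectorized copy added in include/ideep/fast_math.hpp takes its arguments as (src, dst, nelems), which is the reverse of std::memcpy and a count in elements rather than bytes; it falls back to std::memcpy whenever the two pointers differ in their 32-byte alignment remainder. A small self-check, assuming an AVX2 build so that FM_AVX2_PREF is defined:

    #include <cassert>
    #include <cstddef>
    #include <vector>
    #include "ideep.hpp"

    int main() {
    #ifdef __AVX2__
      std::vector<float> a(1000), b(1000, 0.f);
      for (std::size_t i = 0; i < a.size(); ++i)
        a[i] = static_cast<float>(i);

      // Argument order is (src, dst, nelems) -- the reverse of std::memcpy.
      FM_AVX2_PREF::memcpy(a.data(), b.data(), a.size());

      for (std::size_t i = 0; i < a.size(); ++i)
        assert(a[i] == b[i]);
    #endif
      return 0;
    }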
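The rewritten size_to_mask() replaces the eight-case switch with a bitset expression. The scalar sketch below, also not part of the patch, verifies that e[i] - 1 yields -1 (lane enabled) for the first nres lanes and 0 (lane disabled) for the rest, which is exactly the mask layout the old switch produced:

    #include <bitset>
    #include <cassert>

    int main() {
      for (unsigned nres = 0; nres < 8; ++nres) {
        // Bits [0, nres) are cleared, the remaining bits are set.
        std::bitset<8> e = ~((1u << nres) - 1u);
        for (unsigned i = 0; i < 8; ++i) {
          int lane = (e[i] ? 1 : 0) - 1;   // mirrors e[i] - 1 in the patch
          assert(lane == (i < nres ? -1 : 0));
        }
      }
      return 0;
    }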