Skip to content

Commit

Permalink
core(SIMD): align behavior of vector constructors
Browse files Browse the repository at this point in the history
- setzero() calls are dropped due to the low-level nature of the API
- explicit initialization is mandatory where a value is needed (default constructors no longer zero-initialize; skip it when the vector is just an output of other calls)
  • Loading branch information
alalek committed Apr 17, 2020
1 parent 0812207 commit dcf7eb9
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 80 deletions.
28 changes: 18 additions & 10 deletions modules/core/include/opencv2/core/hal/intrin_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ struct v_uint8x32
(char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
(char)v28, (char)v29, (char)v30, (char)v31);
}
v_uint8x32() : val(_mm256_setzero_si256()) {}
v_uint8x32() {}

uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
};

Expand All @@ -183,7 +184,8 @@ struct v_int8x32
v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
}
v_int8x32() : val(_mm256_setzero_si256()) {}
v_int8x32() {}

schar get0() const { return (schar)_v_cvtsi256_si32(val); }
};

Expand All @@ -203,7 +205,8 @@ struct v_uint16x16
(short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
(short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
}
v_uint16x16() : val(_mm256_setzero_si256()) {}
v_uint16x16() {}

ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
};

Expand All @@ -222,7 +225,8 @@ struct v_int16x16
val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15);
}
v_int16x16() : val(_mm256_setzero_si256()) {}
v_int16x16() {}

short get0() const { return (short)_v_cvtsi256_si32(val); }
};

Expand All @@ -239,7 +243,8 @@ struct v_uint32x8
val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
(unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
}
v_uint32x8() : val(_mm256_setzero_si256()) {}
v_uint32x8() {}

unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
};

Expand All @@ -255,7 +260,8 @@ struct v_int32x8
{
val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
}
v_int32x8() : val(_mm256_setzero_si256()) {}
v_int32x8() {}

int get0() const { return _v_cvtsi256_si32(val); }
};

Expand All @@ -271,7 +277,8 @@ struct v_float32x8
{
val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
}
v_float32x8() : val(_mm256_setzero_ps()) {}
v_float32x8() {}

float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
};

Expand All @@ -284,7 +291,7 @@ struct v_uint64x4
explicit v_uint64x4(__m256i v) : val(v) {}
v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
{ val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
v_uint64x4() : val(_mm256_setzero_si256()) {}
v_uint64x4() {}
uint64 get0() const
{
#if defined __x86_64__ || defined _M_X64
Expand All @@ -306,7 +313,7 @@ struct v_int64x4
explicit v_int64x4(__m256i v) : val(v) {}
v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
{ val = _mm256_setr_epi64x(v0, v1, v2, v3); }
v_int64x4() : val(_mm256_setzero_si256()) {}
v_int64x4() {}

int64 get0() const
{
Expand All @@ -329,7 +336,8 @@ struct v_float64x4
explicit v_float64x4(__m256d v) : val(v) {}
v_float64x4(double v0, double v1, double v2, double v3)
{ val = _mm256_setr_pd(v0, v1, v2, v3); }
v_float64x4() : val(_mm256_setzero_pd()) {}
v_float64x4() {}

double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
};

Expand Down
57 changes: 43 additions & 14 deletions modules/core/include/opencv2/core/hal/intrin_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ struct v_uint8x64
v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
}
v_uint8x64() : val(_mm512_setzero_si512()) {}
v_uint8x64() {}

static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }

uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
};

Expand Down Expand Up @@ -177,7 +180,10 @@ struct v_int8x64
v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
}
v_int8x64() : val(_mm512_setzero_si512()) {}
v_int8x64() {}

static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }

schar get0() const { return (schar)_v_cvtsi512_si32(val); }
};

Expand All @@ -200,7 +206,10 @@ struct v_uint16x32
val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
}
v_uint16x32() : val(_mm512_setzero_si512()) {}
v_uint16x32() {}

static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }

ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
};

Expand All @@ -221,7 +230,10 @@ struct v_int16x32
(ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
(ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
}
v_int16x32() : val(_mm512_setzero_si512()) {}
v_int16x32() {}

static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }

short get0() const { return (short)_v_cvtsi512_si32(val); }
};

Expand All @@ -240,7 +252,10 @@ struct v_uint32x16
val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
(int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
}
v_uint32x16() : val(_mm512_setzero_si512()) {}
v_uint32x16() {}

static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }

unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
};

Expand All @@ -256,7 +271,10 @@ struct v_int32x16
{
val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
v_int32x16() : val(_mm512_setzero_si512()) {}
v_int32x16() {}

static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }

int get0() const { return _v_cvtsi512_si32(val); }
};

Expand All @@ -272,7 +290,10 @@ struct v_float32x16
{
val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
v_float32x16() : val(_mm512_setzero_ps()) {}
v_float32x16() {}

static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }

float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
};

Expand All @@ -285,7 +306,10 @@ struct v_uint64x8
explicit v_uint64x8(__m512i v) : val(v) {}
v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
{ val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
v_uint64x8() : val(_mm512_setzero_si512()) {}
v_uint64x8() {}

static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }

uint64 get0() const
{
#if defined __x86_64__ || defined _M_X64
Expand All @@ -307,7 +331,9 @@ struct v_int64x8
explicit v_int64x8(__m512i v) : val(v) {}
v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
{ val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
v_int64x8() : val(_mm512_setzero_si512()) {}
v_int64x8() {}

static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }

int64 get0() const
{
Expand All @@ -330,7 +356,10 @@ struct v_float64x8
explicit v_float64x8(__m512d v) : val(v) {}
v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
{ val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
v_float64x8() : val(_mm512_setzero_pd()) {}
v_float64x8() {}

static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }

double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
};

Expand Down Expand Up @@ -1030,7 +1059,7 @@ inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
if (imm == 0) return a; \
if (imm == _Tpvec::nlanes) return b; \
if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \
if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
} \
template<int imm> \
Expand All @@ -1040,21 +1069,21 @@ inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
if (imm == 0) return a; \
if (imm == _Tpvec::nlanes) return b; \
if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \
if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
if (imm == 0) return a; \
if (imm >= _Tpvec::nlanes) return _Tpvec(); \
if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
if (imm == 0) return a; \
if (imm >= _Tpvec::nlanes) return _Tpvec(); \
if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
}

Expand Down
Loading

0 comments on commit dcf7eb9

Please sign in to comment.