diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h new file mode 100644 index 0000000000..1a7cd03d49 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h @@ -0,0 +1,149 @@ +#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ +#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ +#ifdef _MSC_VER + +#include +#include +#include + +#endif + +namespace Eigen { +namespace internal { + +typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i; +typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { + size = 32, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { + size = 16, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128( + reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128( + reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.m_val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet16q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), + from.m_val); +} + +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.m_val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet16q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), + from.m_val); +} + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8f& a, const Packet8f& b, + const Packet8f& c, const Packet8f& d) { + const __m256i a_conv = _mm256_cvtps_epi32(a); + const __m256i b_conv = _mm256_cvtps_epi32(b); + const __m256i c_conv = _mm256_cvtps_epi32(c); + const __m256i d_conv = _mm256_cvtps_epi32(d); + __m128i low = _mm256_castsi256_si128(a_conv); + __m128i high = _mm256_extractf128_si256(a_conv, 1); + __m128i tmp = _mm_packs_epi32(low, high); + __m128i low2 = _mm256_castsi256_si128(b_conv); + __m128i high2 = _mm256_extractf128_si256(b_conv, 1); + __m128i tmp2 = _mm_packs_epi32(low2, high2); + __m128i converted_low = _mm_packs_epi16(tmp, tmp2); + low = _mm256_castsi256_si128(c_conv); + high = _mm256_extractf128_si256(c_conv, 1); + tmp = _mm_packs_epi32(low, high); + low2 = _mm256_castsi256_si128(d_conv); + high2 = _mm256_extractf128_si256(d_conv, 1); + tmp2 = _mm_packs_epi32(low2, high2); + __m128i converted_high = _mm_packs_epi16(tmp, tmp2); + return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low), + converted_high, 1); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h index 2b16715c72..4c5e02abc9 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -27,61 +27,14 @@ inline int _mm256_extract_epi8_N1(const __m256i X) { namespace Eigen { namespace internal { -typedef struct Packet32q8i { - __m256i val; - operator __m256i() const { return val; } - Packet32q8i() : val(_mm256_setzero_si256()){}; - Packet32q8i(__m256i val) : val(val) {} -} Packet32q8i; - -typedef struct Packet16q16i { - __m256i val; - operator __m256i() const { return val; } - Packet16q16i() : val(_mm256_setzero_si256()){}; - Packet16q16i(__m256i val) : val(val) {} -} Packet16q16i; - -typedef struct Packet32q8u { - __m256i val; - operator __m256i() const { return val; } - Packet32q8u() : val(_mm256_setzero_si256()){}; - Packet32q8u(__m256i val) : val(val) {} -} Packet32q8u; - -typedef struct Packet16q8i { - __m128i val; - operator __m128i() const { return val; } - Packet16q8i() : val(_mm_setzero_si128()) {} - Packet16q8i(__m128i val) : val(val) {} -} Packet16q8i; - -typedef struct Packet16q8u { - __m128i val; - operator __m128i() const { return val; } - Packet16q8u() : val(_mm_setzero_si128()) {} - Packet16q8u(__m128i val) : val(val) {} -} Packet16q8u; - -typedef struct Packet8q16i { - __m128i val; - operator __m128i() const { return val; } - Packet8q16i() : val(_mm_setzero_si128()) {} - Packet8q16i(__m128i val) : val(val) {} -} Packet8q16i; - -typedef struct Packet8q32i { - __m256i val; - operator __m256i() const { return val; } - Packet8q32i() : val(_mm256_setzero_si256()){}; - Packet8q32i(__m256i val) : val(val) {} -} Packet8q32i; - -typedef struct Packet4q32i { - __m128i val; - operator __m128i() const { return val; } - Packet4q32i() : val(_mm_setzero_si128()) {} - Packet4q32i(__m128i val) : val(val) {} -} Packet4q32i; +typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i; +typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i; +typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u; +typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i; +typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u; +typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i; +typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i; +typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i; #ifndef EIGEN_VECTORIZE_AVX512 template <> @@ -315,64 +268,64 @@ EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet16q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet8q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } // Aligned store template <> EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet8q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet16q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } // Extract first element. @@ -382,15 +335,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { } template <> EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { - return _mm256_extract_epi16_N0(a.val); + return _mm256_extract_epi16_N0(a.m_val); } template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { - return static_cast(_mm256_extract_epi8_N0(a.val)); + return static_cast(_mm256_extract_epi8_N0(a.m_val)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet32q8i& a) { - return _mm256_extract_epi8_N0(a.val); + return _mm256_extract_epi8_N0(a.m_val); } // Initialize to constant value. @@ -411,7 +364,7 @@ EIGEN_STRONG_INLINE Packet8q32i pset1(const QInt32& from) { template <> EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_add_epi32(a.val, b.val); + return _mm256_add_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { @@ -420,62 +373,62 @@ EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { template <> EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_sub_epi32(a.val, b.val); + return _mm256_sub_epi32(a.m_val, b.m_val); } // Note: mullo truncates the result to 32 bits. template <> EIGEN_STRONG_INLINE Packet8q32i pmul(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_mullo_epi32(a.val, b.val); + return _mm256_mullo_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet8q32i pnegate(const Packet8q32i& a) { - return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); + return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val); } // Min and max. template <> EIGEN_STRONG_INLINE Packet8q32i pmin(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_min_epi32(a.val, b.val); + return _mm256_min_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_max_epi32(a.val, b.val); + return _mm256_max_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, const Packet16q16i& b) { - return _mm256_min_epi16(a.val, b.val); + return _mm256_min_epi16(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, const Packet16q16i& b) { - return _mm256_max_epi16(a.val, b.val); + return _mm256_max_epi16(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, const Packet32q8u& b) { - return _mm256_min_epu8(a.val, b.val); + return _mm256_min_epu8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8u pmax(const Packet32q8u& a, const Packet32q8u& b) { - return _mm256_max_epu8(a.val, b.val); + return _mm256_max_epu8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8i pmin(const Packet32q8i& a, const Packet32q8i& b) { - return _mm256_min_epi8(a.val, b.val); + return _mm256_min_epi8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8i pmax(const Packet32q8i& a, const Packet32q8i& b) { - return _mm256_max_epi8(a.val, b.val); + return _mm256_max_epi8(a.m_val, b.m_val); } // Reductions. diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h index 6c77aa7b51..5a0ae2e8c8 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -6,33 +6,10 @@ namespace Eigen { namespace internal { -typedef struct Packet64q8i { - __m512i val; - operator __m512i() const { return val; } - Packet64q8i(); - Packet64q8i(__m512i val) : val(val) {} -} Packet64q8i; - -typedef struct Packet32q16i { - __m512i val; - operator __m512i() const { return val; } - Packet32q16i(); - Packet32q16i(__m512i val) : val(val) {} -} Packet32q16i; - -typedef struct Packet64q8u { - __m512i val; - operator __m512i() const { return val; } - Packet64q8u(); - Packet64q8u(__m512i val) : val(val) {} -} Packet64q8u; - -typedef struct Packet16q32i { - __m512i val; - operator __m512i() const { return val; } - Packet16q32i(); - Packet16q32i(__m512i val) : val(val) {} -} Packet16q32i; +typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i; +typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i; +typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u; +typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i; template <> struct packet_traits : default_packet_traits { @@ -216,44 +193,44 @@ EIGEN_STRONG_INLINE Packet16q32i pload(const QInt32* from) { template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet64q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet32q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet64q8u& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet16q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } // Aligned store template <> EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet16q32i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet64q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet64q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet32q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } // Extract first element. @@ -264,15 +241,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet16q32i& a) { template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet64q8u& a) { return static_cast( - _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); + _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet64q8i& a) { - return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0); + return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0); } template <> EIGEN_STRONG_INLINE QInt16 pfirst(const Packet32q16i& a) { - return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0); + return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0); } // Initialize to constant value. @@ -297,46 +274,46 @@ EIGEN_STRONG_INLINE Packet16q32i pset1(const QInt32& from) { template <> EIGEN_STRONG_INLINE Packet16q32i padd(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_add_epi32(a.val, b.val); + return _mm512_add_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i psub(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_sub_epi32(a.val, b.val); + return _mm512_sub_epi32(a.m_val, b.m_val); } // Note: mullo truncates the result to 32 bits. template <> EIGEN_STRONG_INLINE Packet16q32i pmul(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_mullo_epi32(a.val, b.val); + return _mm512_mullo_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i pnegate(const Packet16q32i& a) { - return _mm512_sub_epi32(_mm512_setzero_si512(), a.val); + return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val); } // Min and max. template <> EIGEN_STRONG_INLINE Packet16q32i pmin(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_min_epi32(a.val, b.val); + return _mm512_min_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i pmax(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_max_epi32(a.val, b.val); + return _mm512_max_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet64q8u pmin(const Packet64q8u& a, const Packet64q8u& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epu8(a.val, b.val); + return _mm512_min_epu8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epu8(ap0, bp0); __m256i r1 = _mm256_min_epu8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -346,12 +323,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8u pmax(const Packet64q8u& a, const Packet64q8u& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epu8(a.val, b.val); + return _mm512_max_epu8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epu8(ap0, bp0); __m256i r1 = _mm256_max_epu8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -362,12 +339,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8i pmin(const Packet64q8i& a, const Packet64q8i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epi8(a.val, b.val); + return _mm512_min_epi8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epi8(ap0, bp0); __m256i r1 = _mm256_min_epi8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -377,12 +354,12 @@ template <> EIGEN_STRONG_INLINE Packet32q16i pmin(const Packet32q16i& a, const Packet32q16i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epi16(a.val, b.val); + return _mm512_min_epi16(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epi16(ap0, bp0); __m256i r1 = _mm256_min_epi16(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -392,12 +369,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8i pmax(const Packet64q8i& a, const Packet64q8i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epi8(a.val, b.val); + return _mm512_max_epi8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epi8(ap0, bp0); __m256i r1 = _mm256_max_epi8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -407,12 +384,12 @@ template <> EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, const Packet32q16i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epi16(a.val, b.val); + return _mm512_max_epi16(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epi16(ap0, bp0); __m256i r1 = _mm256_max_epi16(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -422,112 +399,112 @@ EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, // Reductions. template <> EIGEN_STRONG_INLINE QInt32 predux_min(const Packet16q32i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst( - _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + return pfirst(res); } template <> EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst( - _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + return pfirst(res); } template <> EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::max( {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3)); res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::max( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h index 9561d6a338..5dd2cd309b 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -13,7 +13,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8q32i& a) { - return _mm256_cvtepi32_ps(a.val); + return _mm256_cvtepi32_ps(a.m_val); } template <> @@ -35,14 +35,33 @@ template <> EIGEN_STRONG_INLINE Packet32q8i pcast(const Packet8q32i& a, const Packet8q32i& b, const Packet8q32i& c, const Packet8q32i& d) { - __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), - _mm256_packs_epi32(c.val, d.val)); + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val), + _mm256_packs_epi32(c.m_val, d.m_val)); // Since packs does not cross 128 bit lane boundaries, // we have to permute to properly order the final result. const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); return _mm256_permutevar8x32_epi32(converted, permute_mask); } +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8f& a, const Packet8f& b, + const Packet8f& c, const Packet8f& d) { + const __m256i a_conv = _mm256_cvtps_epi32(a); + const __m256i b_conv = _mm256_cvtps_epi32(b); + const __m256i c_conv = _mm256_cvtps_epi32(c); + const __m256i d_conv = _mm256_cvtps_epi32(d); + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv), + _mm256_packs_epi32(c_conv, d_conv)); + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h index d3b0240297..17408d13ab 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h @@ -14,7 +14,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16q32i& a) { - return _mm512_cvtepi32_ps(a.val); + return _mm512_cvtepi32_ps(a.m_val); } template <>