/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2018-2020 Evan Nemerson <evan@nemerson.com>
* 2020 Michael R. Crusoe <crusoe@debian.org>
*/
#include "sse.h"
#if !defined(SIMDE_X86_AVX_H)
#define SIMDE_X86_AVX_H
#include "sse4.2.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
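/* The simde__m256*_private unions below expose the same 32 bytes of storage
 * as arrays of every supported element type, as a pair of 128-bit halves,
 * and, when available, as the native __m256* or AltiVec vector types. */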
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
#endif
SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2];
SIMDE_ALIGN_TO_32 simde__m128 m128[2];
#if defined(SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256 n;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2];
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256_private;
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
#endif
SIMDE_ALIGN_TO_32 simde__m128d_private m128d_private[2];
SIMDE_ALIGN_TO_32 simde__m128d m128d[2];
#if defined(SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256d n;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2];
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256d_private;
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)];
#if defined(SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
#endif
SIMDE_ALIGN_TO_32 simde__m128i_private m128i_private[2];
SIMDE_ALIGN_TO_32 simde__m128i m128i[2];
#if defined(SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256i n;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2];
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2];
#endif
#endif
} simde__m256i_private;
#if defined(SIMDE_X86_AVX_NATIVE)
typedef __m256 simde__m256;
typedef __m256i simde__m256i;
typedef __m256d simde__m256d;
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
typedef simde_float32 simde__m256 SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
typedef int_fast32_t simde__m256i SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
typedef simde_float64 simde__m256d SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
typedef simde__m256_private simde__m256;
typedef simde__m256i_private simde__m256i;
typedef simde__m256d_private simde__m256d;
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#if !defined(HEDLEY_INTEL_VERSION) && !defined(_AVXINTRIN_H_INCLUDED) && !defined(__AVXINTRIN_H) && !defined(_CMP_EQ_OQ)
typedef simde__m256 __m256;
typedef simde__m256i __m256i;
typedef simde__m256d __m256d;
#else
#undef __m256
#define __m256 simde__m256
#undef __m256i
#define __m256i simde__m256i
#undef __m256d
#define __m256d simde__m256d
#endif
#endif
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256), "simde__m256 size incorrect");
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256_private), "simde__m256_private size incorrect");
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i), "simde__m256i size incorrect");
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i_private), "simde__m256i_private size incorrect");
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d), "simde__m256d size incorrect");
HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d_private), "simde__m256d_private size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256) == 32, "simde__m256 is not 32-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256_private) == 32, "simde__m256_private is not 32-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i) == 32, "simde__m256i is not 32-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i_private) == 32, "simde__m256i_private is not 32-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d) == 32, "simde__m256d is not 32-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d_private) == 32, "simde__m256d_private is not 32-byte aligned");
#endif
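/* Conversions between the public and private types go through simde_memcpy,
 * the standard type-punning idiom: compilers typically optimize the copy
 * away, and it avoids the strict-aliasing problems a plain pointer cast
 * could introduce. */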
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde__m256_from_private(simde__m256_private v) {
simde__m256 r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256_private
simde__m256_to_private(simde__m256 v) {
simde__m256_private r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde__m256i_from_private(simde__m256i_private v) {
simde__m256i r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i_private
simde__m256i_to_private(simde__m256i v) {
simde__m256i_private r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde__m256d_from_private(simde__m256d_private v) {
simde__m256d r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d_private
simde__m256d_to_private(simde__m256d v) {
simde__m256d_private r;
simde_memcpy(&r, &v, sizeof(r));
return r;
}
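/* Comparison predicates, mirroring the _CMP_* constants from <immintrin.h>
 * that are passed as the imm8 argument of the _mm256_cmp_* family.  The O/U
 * letter says whether a NaN operand makes the predicate false (ordered) or
 * true (unordered); the Q/S suffix selects quiet vs. signaling NaN
 * handling. */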
#define SIMDE_CMP_EQ_OQ 0
#define SIMDE_CMP_LT_OS 1
#define SIMDE_CMP_LE_OS 2
#define SIMDE_CMP_UNORD_Q 3
#define SIMDE_CMP_NEQ_UQ 4
#define SIMDE_CMP_NLT_US 5
#define SIMDE_CMP_NLE_US 6
#define SIMDE_CMP_ORD_Q 7
#define SIMDE_CMP_EQ_UQ 8
#define SIMDE_CMP_NGE_US 9
#define SIMDE_CMP_NGT_US 10
#define SIMDE_CMP_FALSE_OQ 11
#define SIMDE_CMP_NEQ_OQ 12
#define SIMDE_CMP_GE_OS 13
#define SIMDE_CMP_GT_OS 14
#define SIMDE_CMP_TRUE_UQ 15
#define SIMDE_CMP_EQ_OS 16
#define SIMDE_CMP_LT_OQ 17
#define SIMDE_CMP_LE_OQ 18
#define SIMDE_CMP_UNORD_S 19
#define SIMDE_CMP_NEQ_US 20
#define SIMDE_CMP_NLT_UQ 21
#define SIMDE_CMP_NLE_UQ 22
#define SIMDE_CMP_ORD_S 23
#define SIMDE_CMP_EQ_US 24
#define SIMDE_CMP_NGE_UQ 25
#define SIMDE_CMP_NGT_UQ 26
#define SIMDE_CMP_FALSE_OS 27
#define SIMDE_CMP_NEQ_OS 28
#define SIMDE_CMP_GE_OQ 29
#define SIMDE_CMP_GT_OQ 30
#define SIMDE_CMP_TRUE_US 31
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) && !defined(_CMP_EQ_OQ)
#define _CMP_EQ_OQ SIMDE_CMP_EQ_OQ
#define _CMP_LT_OS SIMDE_CMP_LT_OS
#define _CMP_LE_OS SIMDE_CMP_LE_OS
#define _CMP_UNORD_Q SIMDE_CMP_UNORD_Q
#define _CMP_NEQ_UQ SIMDE_CMP_NEQ_UQ
#define _CMP_NLT_US SIMDE_CMP_NLT_US
#define _CMP_NLE_US SIMDE_CMP_NLE_US
#define _CMP_ORD_Q SIMDE_CMP_ORD_Q
#define _CMP_EQ_UQ SIMDE_CMP_EQ_UQ
#define _CMP_NGE_US SIMDE_CMP_NGE_US
#define _CMP_NGT_US SIMDE_CMP_NGT_US
#define _CMP_FALSE_OQ SIMDE_CMP_FALSE_OQ
#define _CMP_NEQ_OQ SIMDE_CMP_NEQ_OQ
#define _CMP_GE_OS SIMDE_CMP_GE_OS
#define _CMP_GT_OS SIMDE_CMP_GT_OS
#define _CMP_TRUE_UQ SIMDE_CMP_TRUE_UQ
#define _CMP_EQ_OS SIMDE_CMP_EQ_OS
#define _CMP_LT_OQ SIMDE_CMP_LT_OQ
#define _CMP_LE_OQ SIMDE_CMP_LE_OQ
#define _CMP_UNORD_S SIMDE_CMP_UNORD_S
#define _CMP_NEQ_US SIMDE_CMP_NEQ_US
#define _CMP_NLT_UQ SIMDE_CMP_NLT_UQ
#define _CMP_NLE_UQ SIMDE_CMP_NLE_UQ
#define _CMP_ORD_S SIMDE_CMP_ORD_S
#define _CMP_EQ_US SIMDE_CMP_EQ_US
#define _CMP_NGE_UQ SIMDE_CMP_NGE_UQ
#define _CMP_NGT_UQ SIMDE_CMP_NGT_UQ
#define _CMP_FALSE_OS SIMDE_CMP_FALSE_OS
#define _CMP_NEQ_OS SIMDE_CMP_NEQ_OS
#define _CMP_GE_OQ SIMDE_CMP_GE_OQ
#define _CMP_GT_OQ SIMDE_CMP_GT_OQ
#define _CMP_TRUE_US SIMDE_CMP_TRUE_US
#endif
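/* The cast intrinsics below only reinterpret the 256-bit pattern as a
 * different element type; no values are converted, and on native AVX they
 * are intended to compile to no instructions. */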
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castps_pd (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castps_pd(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps_pd
#define _mm256_castps_pd(a) simde_mm256_castps_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_castps_si256 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castps_si256(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps_si256
#define _mm256_castps_si256(a) simde_mm256_castps_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castsi256_pd (simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castsi256_pd(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi256_pd
#define _mm256_castsi256_pd(a) simde_mm256_castsi256_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_castsi256_ps (simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castsi256_ps(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi256_ps
#define _mm256_castsi256_ps(a) simde_mm256_castsi256_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_castpd_ps (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castpd_ps(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd_ps
#define _mm256_castpd_ps(a) simde_mm256_castpd_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_castpd_si256 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castpd_si256(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd_si256
#define _mm256_castpd_si256(a) simde_mm256_castpd_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setzero_si256 (void) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_si256();
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_setzero_si128();
r_.m128i[1] = simde_mm_setzero_si128();
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = 0;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_si256
#define _mm256_setzero_si256() simde_mm256_setzero_si256()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_setzero_ps (void) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_ps();
#else
return simde_mm256_castsi256_ps(simde_mm256_setzero_si256());
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_ps
#define _mm256_setzero_ps() simde_mm256_setzero_ps()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_setzero_pd (void) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_pd();
#else
return simde_mm256_castsi256_pd(simde_mm256_setzero_si256());
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_pd
#define _mm256_setzero_pd() simde_mm256_setzero_pd()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_not_ps(simde__m256 a) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = ~a_.i32;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]);
r_.m128[1] = simde_x_mm_not_ps(a_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = ~(a_.i32[i]);
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) {
/* This function is for when you want to blend two elements together
* according to a mask. It is similar to _mm256_blendv_ps, except that
* it is undefined whether the blend is based on the highest bit in
* each lane (like blendv) or just bitwise operations. This allows
* us to implement the function efficiently everywhere.
*
* Basically, you promise that all the lanes in mask are either 0 or
* ~0. */
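/* A minimal usage sketch (hypothetical values): a mask produced by a
 * comparison already satisfies the all-zeros/all-ones requirement, e.g.
 *   simde__m256 m = simde_mm256_cmp_ps(x, y, SIMDE_CMP_LT_OQ);
 *   simde__m256 r = simde_x_mm256_select_ps(a, b, m);
 * yields b[i] where x[i] < y[i] and a[i] elsewhere. */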
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_ps(a, b, mask);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b),
mask_ = simde__m256_to_private(mask);
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
r_.m128[1] = simde_x_mm_select_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
}
#endif
return simde__m256_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_not_pd(simde__m256d a) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = ~a_.i64;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]);
r_.m128d[1] = simde_x_mm_not_pd(a_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
r_.i64[i] = ~(a_.i64[i]);
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) {
/* This function is for when you want to blend two elements together
* according to a mask. It is similar to _mm256_blendv_pd, except that
* it is undefined whether the blend is based on the highest bit in
* each lane (like blendv) or just bitwise operations. This allows
* us to implement the function efficiently everywhere.
*
* Basically, you promise that all the lanes in mask are either 0 or
* ~0. */
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_pd(a, b, mask);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b),
mask_ = simde__m256d_to_private(mask);
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
r_.m128d[1] = simde_x_mm_select_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
}
#endif
return simde__m256d_from_private(r_);
#endif
}
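/* simde_x_mm256_setone_si256 returns a vector with every bit set (the
 * complement of setzero); the _ps/_pd wrappers below reuse it for the
 * floating-point all-ones constants. */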
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_setone_si256 (void) {
simde__m256i_private r_;
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
__typeof__(r_.i32f) rv = { 0, };
r_.i32f = ~rv;
#elif defined(SIMDE_X86_AVX2_NATIVE)
__m256i t = _mm256_setzero_si256();
r_.n = _mm256_cmpeq_epi32(t, t);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_setone_ps (void) {
return simde_mm256_castsi256_ps(simde_x_mm256_setone_si256());
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_setone_pd (void) {
return simde_mm256_castsi256_pd(simde_x_mm256_setone_si256());
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28,
int8_t e27, int8_t e26, int8_t e25, int8_t e24,
int8_t e23, int8_t e22, int8_t e21, int8_t e20,
int8_t e19, int8_t e18, int8_t e17, int8_t e16,
int8_t e15, int8_t e14, int8_t e13, int8_t e12,
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16,
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi8(
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi8(
e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16);
#else
r_.i8[ 0] = e0;
r_.i8[ 1] = e1;
r_.i8[ 2] = e2;
r_.i8[ 3] = e3;
r_.i8[ 4] = e4;
r_.i8[ 5] = e5;
r_.i8[ 6] = e6;
r_.i8[ 7] = e7;
r_.i8[ 8] = e8;
r_.i8[ 9] = e9;
r_.i8[10] = e10;
r_.i8[11] = e11;
r_.i8[12] = e12;
r_.i8[13] = e13;
r_.i8[14] = e14;
r_.i8[15] = e15;
r_.i8[16] = e16;
r_.i8[17] = e17;
r_.i8[18] = e18;
r_.i8[19] = e19;
r_.i8[20] = e20;
r_.i8[21] = e21;
r_.i8[22] = e22;
r_.i8[23] = e23;
r_.i8[24] = e24;
r_.i8[25] = e25;
r_.i8[26] = e26;
r_.i8[27] = e27;
r_.i8[28] = e28;
r_.i8[29] = e29;
r_.i8[30] = e30;
r_.i8[31] = e31;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi8
#define _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12,
int16_t e11, int16_t e10, int16_t e9, int16_t e8,
int16_t e7, int16_t e6, int16_t e5, int16_t e4,
int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi16( e7, e6, e5, e4, e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8);
#else
r_.i16[ 0] = e0;
r_.i16[ 1] = e1;
r_.i16[ 2] = e2;
r_.i16[ 3] = e3;
r_.i16[ 4] = e4;
r_.i16[ 5] = e5;
r_.i16[ 6] = e6;
r_.i16[ 7] = e7;
r_.i16[ 8] = e8;
r_.i16[ 9] = e9;
r_.i16[10] = e10;
r_.i16[11] = e11;
r_.i16[12] = e12;
r_.i16[13] = e13;
r_.i16[14] = e14;
r_.i16[15] = e15;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi16
#define _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4,
int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi32(e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi32(e7, e6, e5, e4);
#else
r_.i32[ 0] = e0;
r_.i32[ 1] = e1;
r_.i32[ 2] = e2;
r_.i32[ 3] = e3;
r_.i32[ 4] = e4;
r_.i32[ 5] = e5;
r_.i32[ 6] = e6;
r_.i32[ 7] = e7;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi32
#define _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi64x(e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi64x(e1, e0);
r_.m128i[1] = simde_mm_set_epi64x(e3, e2);
#else
r_.i64[0] = e0;
r_.i64[1] = e1;
r_.i64[2] = e2;
r_.i64[3] = e3;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi64x
#define _mm256_set_epi64x(e3, e2, e1, e0) simde_mm256_set_epi64x(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu8 (uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28,
uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24,
uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20,
uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16,
uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8,
uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) {
simde__m256i_private r_;
r_.u8[ 0] = e0;
r_.u8[ 1] = e1;
r_.u8[ 2] = e2;
r_.u8[ 3] = e3;
r_.u8[ 4] = e4;
r_.u8[ 5] = e5;
r_.u8[ 6] = e6;
r_.u8[ 7] = e7;
r_.u8[ 8] = e8;
r_.u8[ 9] = e9;
r_.u8[10] = e10;
r_.u8[11] = e11;
r_.u8[12] = e12;
r_.u8[13] = e13;
r_.u8[14] = e14;
r_.u8[15] = e15;
r_.u8[16] = e16;
r_.u8[17] = e17;
r_.u8[18] = e18;
r_.u8[19] = e19;
r_.u8[20] = e20;
r_.u8[21] = e21;
r_.u8[22] = e22;
r_.u8[23] = e23;
r_.u8[24] = e24;
r_.u8[25] = e25;
r_.u8[26] = e26;
r_.u8[27] = e27;
r_.u8[28] = e28;
r_.u8[29] = e29;
r_.u8[30] = e30;
r_.u8[31] = e31;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu16 (uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12,
uint16_t e11, uint16_t e10, uint16_t e9, uint16_t e8,
uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
simde__m256i_private r_;
r_.u16[ 0] = e0;
r_.u16[ 1] = e1;
r_.u16[ 2] = e2;
r_.u16[ 3] = e3;
r_.u16[ 4] = e4;
r_.u16[ 5] = e5;
r_.u16[ 6] = e6;
r_.u16[ 7] = e7;
r_.u16[ 8] = e8;
r_.u16[ 9] = e9;
r_.u16[10] = e10;
r_.u16[11] = e11;
r_.u16[12] = e12;
r_.u16[13] = e13;
r_.u16[14] = e14;
r_.u16[15] = e15;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4,
uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4),
HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
r_.m128i[1] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4));
#else
r_.u32[ 0] = e0;
r_.u32[ 1] = e1;
r_.u32[ 2] = e2;
r_.u32[ 3] = e3;
r_.u32[ 4] = e4;
r_.u32[ 5] = e5;
r_.u32[ 6] = e6;
r_.u32[ 7] = e7;
#endif
return simde__m256i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu64x (uint64_t e3, uint64_t e2, uint64_t e1, uint64_t e0) {
simde__m256i_private r_;
r_.u64[0] = e0;
r_.u64[1] = e1;
r_.u64[2] = e2;
r_.u64[3] = e3;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4,
simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0);
r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4);
#else
r_.f32[0] = e0;
r_.f32[1] = e1;
r_.f32[2] = e2;
r_.f32[3] = e3;
r_.f32[4] = e4;
r_.f32[5] = e5;
r_.f32[6] = e6;
r_.f32[7] = e7;
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_ps
#define _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set_pd(e3, e2, e1, e0);
#else
simde__m256d_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_set_pd(e1, e0);
r_.m128d[1] = simde_mm_set_pd(e3, e2);
#else
r_.f64[0] = e0;
r_.f64[1] = e1;
r_.f64[2] = e2;
r_.f64[3] = e3;
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_pd
#define _mm256_set_pd(e3, e2, e1, e0) \
simde_mm256_set_pd(e3, e2, e1, e0)
#endif
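/* Note: on native AVX the _mm256_set_m128* operations below are composed
 * from a cast plus an insertf128, presumably (an assumption here) because
 * some older compilers provide AVX without _mm256_set_m128* itself. */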
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set_m128 (simde__m128 e1, simde__m128 e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_ps(_mm256_castps128_ps256(e0), e1, 1);
#else
simde__m256_private r_;
simde__m128_private
e1_ = simde__m128_to_private(e1),
e0_ = simde__m128_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128_private[0] = e0_;
r_.m128_private[1] = e1_;
#elif defined(SIMDE_HAVE_INT128_)
r_.i128[0] = e0_.i128[0];
r_.i128[1] = e1_.i128[0];
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128
#define _mm256_set_m128(e1, e0) simde_mm256_set_m128(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set_m128d (simde__m128d e1, simde__m128d e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_pd(_mm256_castpd128_pd256(e0), e1, 1);
#else
simde__m256d_private r_;
simde__m128d_private
e1_ = simde__m128d_to_private(e1),
e0_ = simde__m128d_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d_private[0] = e0_;
r_.m128d_private[1] = e1_;
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128d
#define _mm256_set_m128d(e1, e0) simde_mm256_set_m128d(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_m128i (simde__m128i e1, simde__m128i e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_si256(_mm256_castsi128_si256(e0), e1, 1);
#else
simde__m256i_private r_;
simde__m128i_private
e1_ = simde__m128i_to_private(e1),
e0_ = simde__m128i_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i_private[0] = e0_;
r_.m128i_private[1] = e1_;
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128i
#define _mm256_set_m128i(e1, e0) simde_mm256_set_m128i(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi8 (int8_t a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi8(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi8(a);
r_.m128i[1] = simde_mm_set1_epi8(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
r_.i8[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi8
#define _mm256_set1_epi8(a) simde_mm256_set1_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi16 (int16_t a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi16(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi16(a);
r_.m128i[1] = simde_mm_set1_epi16(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
r_.i16[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi16
#define _mm256_set1_epi16(a) simde_mm256_set1_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi32 (int32_t a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi32(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi32(a);
r_.m128i[1] = simde_mm_set1_epi32(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi32
#define _mm256_set1_epi32(a) simde_mm256_set1_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi64x (int64_t a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi64x(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi64x(a);
r_.m128i[1] = simde_mm_set1_epi64x(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
r_.i64[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi64x
#define _mm256_set1_epi64x(a) simde_mm256_set1_epi64x(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set1_ps (simde_float32 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_ps(a);
#else
simde__m256_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_set1_ps(a);
r_.m128[1] = simde_mm_set1_ps(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a;
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_ps
#define _mm256_set1_ps(a) simde_mm256_set1_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set1_pd (simde_float64 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_set1_pd(a);
#else
simde__m256d_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_set1_pd(a);
r_.m128d[1] = simde_mm_set1_pd(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a;
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_pd
#define _mm256_set1_pd(a) simde_mm256_set1_pd(a)
#endif
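/* The simde_x_mm256_deinterleave{even,odd}_* helpers gather the even- or
 * odd-indexed elements of a and b, working within each 128-bit half so the
 * lane boundaries match the hardware behaviour; the horizontal add (and
 * subtract) intrinsics such as simde_mm256_hadd_ps are built on top of
 * them. */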
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30);
#else
const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i16[i] = a_.i16[2 * i];
r_.i16[i + quarter_point] = b_.i16[2 * i];
r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i];
r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveodd_epi16 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31);
#else
const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i16[i] = a_.i16[2 * i + 1];
r_.i16[i + quarter_point] = b_.i16[2 * i + 1];
r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1];
r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveeven_epi32 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14);
#else
const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i32[i] = a_.i32[2 * i];
r_.i32[i + quarter_point] = b_.i32[2 * i];
r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i];
r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveodd_epi32 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15);
#else
const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2;
const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i32[i] = a_.i32[2 * i + 1];
r_.i32[i + quarter_point] = b_.i32[2 * i + 1];
r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1];
r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_deinterleaveeven_ps (simde__m256 a, simde__m256 b) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_x_mm_deinterleaveeven_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_x_mm_deinterleaveeven_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 2, 8, 10, 4, 6, 12, 14);
#else
const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f32[i] = a_.f32[2 * i];
r_.f32[i + quarter_point] = b_.f32[2 * i];
r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i];
r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i];
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_deinterleaveodd_ps (simde__m256 a, simde__m256 b) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_x_mm_deinterleaveodd_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_x_mm_deinterleaveodd_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 1, 3, 9, 11, 5, 7, 13, 15);
#else
const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2;
const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f32[i] = a_.f32[2 * i + 1];
r_.f32[i + quarter_point] = b_.f32[2 * i + 1];
r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i + 1];
r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i + 1];
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_deinterleaveeven_pd (simde__m256d a, simde__m256d b) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_x_mm_deinterleaveeven_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_x_mm_deinterleaveeven_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6);
#else
const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f64[i] = a_.f64[2 * i];
r_.f64[i + quarter_point] = b_.f64[2 * i];
r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i];
r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i];
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_deinterleaveodd_pd (simde__m256d a, simde__m256d b) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_x_mm_deinterleaveodd_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_x_mm_deinterleaveodd_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7);
#else
const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2;
const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f64[i] = a_.f64[2 * i + 1];
r_.f64[i + quarter_point] = b_.f64[2 * i + 1];
r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i + 1];
r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i + 1];
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_abs_ps(simde__m256 a) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_fabsf(a_.f32[i]);
}
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_abs_pd(simde__m256d a) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_fabs(a_.f64[i]);
}
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_add_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_add_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_add_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_add_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f32 = a_.f32 + b_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[i] + b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_ps
#define _mm256_add_ps(a, b) simde_mm256_add_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_hadd_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_hadd_ps(a, b);
#else
return simde_mm256_add_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_ps
#define _mm256_hadd_ps(a, b) simde_mm256_hadd_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_add_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_add_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_add_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_add_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f64 = a_.f64 + b_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[i] + b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_pd
#define _mm256_add_pd(a, b) simde_mm256_add_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_hadd_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_hadd_pd(a, b);
#else
return simde_mm256_add_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_pd
#define _mm256_hadd_pd(a, b) simde_mm256_hadd_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ];
r_.f32[i + 1] = a_.f32[i + 1] + b_.f32[i + 1];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_addsub_ps
#define _mm256_addsub_ps(a, b) simde_mm256_addsub_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ];
r_.f64[i + 1] = a_.f64[i + 1] + b_.f64[i + 1];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_addsub_pd
#define _mm256_addsub_pd(a, b) simde_mm256_addsub_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_and_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_and_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_and_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_and_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_and_ps
#define _mm256_and_ps(a, b) simde_mm256_and_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_and_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_and_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_and_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_and_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_and_pd
#define _mm256_and_pd(a, b) simde_mm256_and_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_andnot_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_andnot_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_andnot_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_andnot_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = ~a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_andnot_ps
#define _mm256_andnot_ps(a, b) simde_mm256_andnot_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_andnot_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_andnot_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_andnot_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_andnot_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = ~a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_andnot_pd
#define _mm256_andnot_pd(a, b) simde_mm256_andnot_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_blend_ps (simde__m256 a, simde__m256 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
}
return simde__m256_from_private(r_);
}
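/* When a native or 128-bit path exists, the function above is overridden by
 * the macros below so that imm8 stays a compile-time constant; the 8-bit
 * mask is split into a low and a high part, one per 128-bit half. */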
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_blend_ps(a, b, imm8) _mm256_blend_ps(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_ps(a, b, imm8) \
simde_mm256_set_m128( \
simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8) >> 4), \
simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8) & 0x0F))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_ps
#define _mm256_blend_ps(a, b, imm8) simde_mm256_blend_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_blend_pd (simde__m256d a, simde__m256d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
}
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_blend_pd(a, b, imm8) _mm256_blend_pd(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_pd(a, b, imm8) \
simde_mm256_set_m128d( \
simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8) >> 2), \
simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8) & 3))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_pd
#define _mm256_blend_pd(a, b, imm8) simde_mm256_blend_pd(a, b, imm8)
#endif
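/* Unlike the bitwise simde_x_mm256_select_* helpers, blendv follows the
 * hardware rule exactly: element i comes from b when the most significant
 * (sign) bit of mask element i is set, otherwise from a. */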
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_blendv_ps (simde__m256 a, simde__m256 b, simde__m256 mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_ps(a, b, mask);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b),
mask_ = simde__m256_to_private(mask);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_blendv_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
r_.m128[1] = simde_mm_blendv_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
r_.f32[i] = (mask_.u32[i] & (UINT32_C(1) << 31)) ? b_.f32[i] : a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blendv_ps
#define _mm256_blendv_ps(a, b, mask) simde_mm256_blendv_ps(a, b, mask)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_blendv_pd (simde__m256d a, simde__m256d b, simde__m256d mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_pd(a, b, mask);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b),
mask_ = simde__m256d_to_private(mask);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_blendv_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
r_.m128d[1] = simde_mm_blendv_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
r_.f64[i] = (mask_.u64[i] & (UINT64_C(1) << 63)) ? b_.f64[i] : a_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blendv_pd
#define _mm256_blendv_pd(a, b, mask) simde_mm256_blendv_pd(a, b, mask)
#endif
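/* The broadcast family: _pd/_ps load 128 bits from memory and repeat them in
 * both halves of the result, while _sd/_ss splat a single scalar across
 * every element (implemented here via set1). */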
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcast_pd (simde__m128d const * mem_addr) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_pd(mem_addr);
#else
simde__m256d_private r_;
simde__m128d tmp = simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, mem_addr));
r_.m128d[0] = tmp;
r_.m128d[1] = tmp;
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_pd
#define _mm256_broadcast_pd(mem_addr) simde_mm256_broadcast_pd(mem_addr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcast_ps (simde__m128 const * mem_addr) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_ps(mem_addr);
#else
simde__m256_private r_;
simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const*, mem_addr));
r_.m128[0] = tmp;
r_.m128[1] = tmp;
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_ps
#define _mm256_broadcast_ps(mem_addr) simde_mm256_broadcast_ps(HEDLEY_REINTERPRET_CAST(simde__m128 const*, mem_addr))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcast_sd (simde_float64 const * a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_sd(a);
#else
return simde_mm256_set1_pd(*a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_sd
#define _mm256_broadcast_sd(mem_addr) simde_mm256_broadcast_sd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_broadcast_ss (simde_float32 const * a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_broadcast_ss(a);
#else
return simde_mm_set1_ps(*a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcast_ss
#define _mm_broadcast_ss(mem_addr) simde_mm_broadcast_ss(mem_addr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcast_ss (simde_float32 const * a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_ss(a);
#else
return simde_mm256_set1_ps(*a);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_ss
#define _mm256_broadcast_ss(mem_addr) simde_mm256_broadcast_ss(mem_addr)
#endif
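/* The cast*128*256 functions only define the low 128 bits of the result; as
 * with the native intrinsics, the upper 128 bits are undefined.  The
 * 256->128 casts simply return the low lane. */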
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castpd128_pd256 (simde__m128d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castpd128_pd256(a);
#else
simde__m256d_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);
r_.m128d_private[0] = a_;
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd128_pd256
#define _mm256_castpd128_pd256(a) simde_mm256_castpd128_pd256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm256_castpd256_pd128 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castpd256_pd128(a);
#else
simde__m256d_private a_ = simde__m256d_to_private(a);
return a_.m128d[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd256_pd128
#define _mm256_castpd256_pd128(a) simde_mm256_castpd256_pd128(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_castps128_ps256 (simde__m128 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castps128_ps256(a);
#else
simde__m256_private r_;
simde__m128_private a_ = simde__m128_to_private(a);
r_.m128_private[0] = a_;
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps128_ps256
#define _mm256_castps128_ps256(a) simde_mm256_castps128_ps256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm256_castps256_ps128 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castps256_ps128(a);
#else
simde__m256_private a_ = simde__m256_to_private(a);
return a_.m128[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps256_ps128
#define _mm256_castps256_ps128(a) simde_mm256_castps256_ps128(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_castsi128_si256 (simde__m128i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castsi128_si256(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
r_.m128i_private[0] = a_;
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi128_si256
#define _mm256_castsi128_si256(a) simde_mm256_castsi128_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_castsi256_si128 (simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_castsi256_si128(a);
#else
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.m128i[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi256_si128
#define _mm256_castsi256_si128(a) simde_mm256_castsi256_si128(a)
#endif
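/* round_{ps,pd} dispatch on the rounding-mode bits of the immediate.
 * SIMDE_MM_FROUND_NO_EXC only suppresses floating-point exceptions, which
 * this emulation does not model, so it is masked off before the switch.
 * When only 128-bit vectors are natively available, the macro versions
 * below forward each lane to simde_mm_round_{ps,pd}. */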
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_round_ps (simde__m256 a, const int rounding) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
#if defined(simde_math_nearbyintf)
case SIMDE_MM_FROUND_CUR_DIRECTION:
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
}
break;
#endif
#if defined(simde_math_roundf)
case SIMDE_MM_FROUND_TO_NEAREST_INT:
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_roundf(a_.f32[i]);
}
break;
#endif
#if defined(simde_math_floorf)
case SIMDE_MM_FROUND_TO_NEG_INF:
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_floorf(a_.f32[i]);
}
break;
#endif
#if defined(simde_math_ceilf)
case SIMDE_MM_FROUND_TO_POS_INF:
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_ceilf(a_.f32[i]);
}
break;
#endif
#if defined(simde_math_truncf)
case SIMDE_MM_FROUND_TO_ZERO:
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_truncf(a_.f32[i]);
}
break;
#endif
default:
HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_ps());
}
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_round_ps(a, rounding) _mm256_round_ps(a, rounding)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm256_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \
simde__m256_private \
simde_mm256_round_ps_r_, \
simde_mm256_round_ps_a_ = simde__m256_to_private(a); \
\
for (size_t simde_mm256_round_ps_i = 0 ; simde_mm256_round_ps_i < (sizeof(simde_mm256_round_ps_r_.m128) / sizeof(simde_mm256_round_ps_r_.m128[0])) ; simde_mm256_round_ps_i++) { \
simde_mm256_round_ps_r_.m128[simde_mm256_round_ps_i] = simde_mm_round_ps(simde_mm256_round_ps_a_.m128[simde_mm256_round_ps_i], rounding); \
} \
\
simde__m256_from_private(simde_mm256_round_ps_r_); \
}))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_round_ps
#define _mm256_round_ps(a, rounding) simde_mm256_round_ps(a, rounding)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_round_pd (simde__m256d a, const int rounding) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
#if defined(simde_math_nearbyint)
case SIMDE_MM_FROUND_CUR_DIRECTION:
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
}
break;
#endif
#if defined(simde_math_round)
case SIMDE_MM_FROUND_TO_NEAREST_INT:
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_round(a_.f64[i]);
}
break;
#endif
#if defined(simde_math_floor)
case SIMDE_MM_FROUND_TO_NEG_INF:
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_floor(a_.f64[i]);
}
break;
#endif
#if defined(simde_math_ceil)
case SIMDE_MM_FROUND_TO_POS_INF:
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_ceil(a_.f64[i]);
}
break;
#endif
#if defined(simde_math_trunc)
case SIMDE_MM_FROUND_TO_ZERO:
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_trunc(a_.f64[i]);
}
break;
#endif
default:
HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_pd());
}
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_round_pd(a, rounding) _mm256_round_pd(a, rounding)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm256_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \
simde__m256d_private \
simde_mm256_round_pd_r_, \
simde_mm256_round_pd_a_ = simde__m256d_to_private(a); \
\
for (size_t simde_mm256_round_pd_i = 0 ; simde_mm256_round_pd_i < (sizeof(simde_mm256_round_pd_r_.m128d) / sizeof(simde_mm256_round_pd_r_.m128d[0])) ; simde_mm256_round_pd_i++) { \
simde_mm256_round_pd_r_.m128d[simde_mm256_round_pd_i] = simde_mm_round_pd(simde_mm256_round_pd_a_.m128d[simde_mm256_round_pd_i], rounding); \
} \
\
simde__m256d_from_private(simde_mm256_round_pd_r_); \
}))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_round_pd
#define _mm256_round_pd(a, rounding) simde_mm256_round_pd(a, rounding)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_ceil_pd (simde__m256d a) {
return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_ceil_pd
#define _mm256_ceil_pd(a) simde_mm256_ceil_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_ceil_ps (simde__m256 a) {
return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_ceil_ps
#define _mm256_ceil_ps(a) simde_mm256_ceil_ps(a)
#endif
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL
/* This implementation does not support signaling NaNs (yet?) */
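/* Predicate naming for the cmp_* immediates: O/U distinguish ordered
 * comparisons (false when either operand is NaN) from unordered ones (true
 * when either operand is NaN), while Q/S (quiet/signaling) only differ in
 * whether an exception is raised on a quiet NaN.  Exceptions are not
 * modeled here, so each Q/S pair maps to the same implementation. */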
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmp_pd (simde__m128d a, simde__m128d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
switch (imm8) {
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
return simde_mm_or_pd(simde_mm_cmpunord_pd(a, b), simde_mm_cmpeq_pd(a, b));
break;
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
return simde_mm_cmpeq_pd(a, b);
break;
case SIMDE_CMP_NGE_US:
case SIMDE_CMP_NGE_UQ:
return simde_x_mm_not_pd(simde_mm_cmpge_pd(a, b));
break;
case SIMDE_CMP_LT_OS:
case SIMDE_CMP_LT_OQ:
return simde_mm_cmplt_pd(a, b);
break;
case SIMDE_CMP_NGT_US:
case SIMDE_CMP_NGT_UQ:
return simde_x_mm_not_pd(simde_mm_cmpgt_pd(a, b));
break;
case SIMDE_CMP_LE_OS:
case SIMDE_CMP_LE_OQ:
return simde_mm_cmple_pd(a, b);
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
return simde_mm_cmpneq_pd(a, b);
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
return simde_mm_and_pd(simde_mm_cmpord_pd(a, b), simde_mm_cmpneq_pd(a, b));
break;
case SIMDE_CMP_NLT_US:
case SIMDE_CMP_NLT_UQ:
return simde_x_mm_not_pd(simde_mm_cmplt_pd(a, b));
break;
case SIMDE_CMP_GE_OS:
case SIMDE_CMP_GE_OQ:
return simde_mm_cmpge_pd(a, b);
break;
case SIMDE_CMP_NLE_US:
case SIMDE_CMP_NLE_UQ:
return simde_x_mm_not_pd(simde_mm_cmple_pd(a, b));
break;
case SIMDE_CMP_GT_OS:
case SIMDE_CMP_GT_OQ:
return simde_mm_cmpgt_pd(a, b);
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
return simde_mm_setzero_pd();
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
return simde_x_mm_setone_pd();
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
return simde_mm_cmpunord_pd(a, b);
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
return simde_mm_cmpord_pd(a, b);
break;
}
HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_pd());
}
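/* Work around the clang issue described before simde_mm_cmp_ps below: with
 * AVX-512 enabled, clang prior to 9.0 mishandles the always-true and
 * always-false predicates, so they are special-cased here and everything
 * else falls back to the function above. */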
#if defined(__clang__) && defined(__AVX512DQ__)
#define simde_mm_cmp_pd(a, b, imm8) (__extension__ ({ \
simde__m128d simde_mm_cmp_pd_r; \
switch (imm8) { \
case SIMDE_CMP_FALSE_OQ: \
case SIMDE_CMP_FALSE_OS: \
simde_mm_cmp_pd_r = simde_mm_setzero_pd(); \
break; \
case SIMDE_CMP_TRUE_UQ: \
case SIMDE_CMP_TRUE_US: \
simde_mm_cmp_pd_r = simde_x_mm_setone_pd(); \
break; \
default: \
simde_mm_cmp_pd_r = simde_mm_cmp_pd(a, b, imm8); \
break; \
} \
simde_mm_cmp_pd_r; \
}))
#elif defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm_cmp_pd(a, b, imm8) _mm_cmp_pd(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_cmp_pd
#define _mm_cmp_pd(a, b, imm8) simde_mm_cmp_pd(a, b, imm8)
#endif
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cmp_ps (simde__m128 a, simde__m128 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
switch (imm8) {
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
return simde_mm_or_ps(simde_mm_cmpunord_ps(a, b), simde_mm_cmpeq_ps(a, b));
break;
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
return simde_mm_cmpeq_ps(a, b);
break;
case SIMDE_CMP_NGE_US:
case SIMDE_CMP_NGE_UQ:
return simde_x_mm_not_ps(simde_mm_cmpge_ps(a, b));
break;
case SIMDE_CMP_LT_OS:
case SIMDE_CMP_LT_OQ:
return simde_mm_cmplt_ps(a, b);
break;
case SIMDE_CMP_NGT_US:
case SIMDE_CMP_NGT_UQ:
return simde_x_mm_not_ps(simde_mm_cmpgt_ps(a, b));
break;
case SIMDE_CMP_LE_OS:
case SIMDE_CMP_LE_OQ:
return simde_mm_cmple_ps(a, b);
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
return simde_mm_cmpneq_ps(a, b);
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
return simde_mm_and_ps(simde_mm_cmpord_ps(a, b), simde_mm_cmpneq_ps(a, b));
break;
case SIMDE_CMP_NLT_US:
case SIMDE_CMP_NLT_UQ:
return simde_x_mm_not_ps(simde_mm_cmplt_ps(a, b));
break;
case SIMDE_CMP_GE_OS:
case SIMDE_CMP_GE_OQ:
return simde_mm_cmpge_ps(a, b);
break;
case SIMDE_CMP_NLE_US:
case SIMDE_CMP_NLE_UQ:
return simde_x_mm_not_ps(simde_mm_cmple_ps(a, b));
break;
case SIMDE_CMP_GT_OS:
case SIMDE_CMP_GT_OQ:
return simde_mm_cmpgt_ps(a, b);
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
return simde_mm_setzero_ps();
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
return simde_x_mm_setone_ps();
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
return simde_mm_cmpunord_ps(a, b);
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
return simde_mm_cmpord_ps(a, b);
break;
}
HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_ps());
}
/* Prior to 9.0 clang has problems with _mm{,256}_cmp_{ps,pd} for all four of the true/false
* comparisons, but only when AVX-512 is enabled. */
#if defined(__clang__) && defined(__AVX512DQ__)
#define simde_mm_cmp_ps(a, b, imm8) (__extension__ ({ \
simde__m128 simde_mm_cmp_ps_r; \
switch (imm8) { \
case SIMDE_CMP_FALSE_OQ: \
case SIMDE_CMP_FALSE_OS: \
simde_mm_cmp_ps_r = simde_mm_setzero_ps(); \
break; \
case SIMDE_CMP_TRUE_UQ: \
case SIMDE_CMP_TRUE_US: \
simde_mm_cmp_ps_r = simde_x_mm_setone_ps(); \
break; \
default: \
simde_mm_cmp_ps_r = simde_mm_cmp_ps(a, b, imm8); \
break; \
} \
simde_mm_cmp_ps_r; \
}))
#elif defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm_cmp_ps(a, b, imm8) _mm_cmp_ps(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_cmp_ps
#define _mm_cmp_ps(a, b, imm8) simde_mm_cmp_ps(a, b, imm8)
#endif
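/* cmp_sd/cmp_ss compare only element 0 and copy the remaining elements
 * through from a, mirroring the scalar intrinsics.  The (x != x) tests are
 * the portable NaN checks used to implement the ordered/unordered
 * predicates. */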
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
simde__m128d_private
a_ = simde__m128d_to_private(a),
b_ = simde__m128d_to_private(b);
switch (imm8) {
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
a_.i64[0] = (a_.f64[0] == b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_LT_OQ:
case SIMDE_CMP_LT_OS:
a_.i64[0] = (a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_LE_OQ:
case SIMDE_CMP_LE_OS:
a_.i64[0] = (a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
a_.i64[0] = ((a_.f64[0] != a_.f64[0]) || (b_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NLT_UQ:
case SIMDE_CMP_NLT_US:
a_.i64[0] = !(a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NLE_UQ:
case SIMDE_CMP_NLE_US:
a_.i64[0] = !(a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
a_.i64[0] = ((a_.f64[0] != a_.f64[0]) | (b_.f64[0] != b_.f64[0]) | (a_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NGE_UQ:
case SIMDE_CMP_NGE_US:
a_.i64[0] = !(a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_NGT_UQ:
case SIMDE_CMP_NGT_US:
a_.i64[0] = !(a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
a_.i64[0] = INT64_C(0);
break;
case SIMDE_CMP_GE_OQ:
case SIMDE_CMP_GE_OS:
a_.i64[0] = (a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_GT_OQ:
case SIMDE_CMP_GT_OS:
a_.i64[0] = (a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0);
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
a_.i64[0] = ~INT64_C(0);
break;
default:
HEDLEY_UNREACHABLE();
}
return simde__m128d_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm_cmp_sd(a, b, imm8) _mm_cmp_sd(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_cmp_sd
#define _mm_cmp_sd(a, b, imm8) simde_mm_cmp_sd(a, b, imm8)
#endif
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cmp_ss (simde__m128 a, simde__m128 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
simde__m128_private
a_ = simde__m128_to_private(a),
b_ = simde__m128_to_private(b);
switch (imm8) {
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
a_.i32[0] = (a_.f32[0] == b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_LT_OQ:
case SIMDE_CMP_LT_OS:
a_.i32[0] = (a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_LE_OQ:
case SIMDE_CMP_LE_OS:
a_.i32[0] = (a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
a_.i32[0] = ((a_.f32[0] != a_.f32[0]) || (b_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NLT_UQ:
case SIMDE_CMP_NLT_US:
a_.i32[0] = !(a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NLE_UQ:
case SIMDE_CMP_NLE_US:
a_.i32[0] = !(a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
a_.i32[0] = ((a_.f32[0] != a_.f32[0]) | (b_.f32[0] != b_.f32[0]) | (a_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NGE_UQ:
case SIMDE_CMP_NGE_US:
a_.i32[0] = !(a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_NGT_UQ:
case SIMDE_CMP_NGT_US:
a_.i32[0] = !(a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
a_.i32[0] = INT32_C(0);
break;
case SIMDE_CMP_GE_OQ:
case SIMDE_CMP_GE_OS:
a_.i32[0] = (a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_GT_OQ:
case SIMDE_CMP_GT_OS:
a_.i32[0] = (a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0);
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
a_.i32[0] = ~INT32_C(0);
break;
default:
HEDLEY_UNREACHABLE();
}
return simde__m128_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm_cmp_ss(a, b, imm8) _mm_cmp_ss(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_cmp_ss
#define _mm_cmp_ss(a, b, imm8) simde_mm_cmp_ss(a, b, imm8)
#endif
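/* When the clang/AVX-512 workaround is active, the 256-bit comparison core
 * is compiled under the *_internal_ name and the public simde_mm256_cmp_pd
 * (and _ps) entry point is provided by the wrapper macro that follows the
 * function. */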
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m256d
#if defined(__clang__) && defined(__AVX512DQ__)
simde_mm256_cmp_pd_internal_
#else
simde_mm256_cmp_pd
#endif
(simde__m256d a, simde__m256d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
switch (imm8) {
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] == b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_LT_OQ:
case SIMDE_CMP_LT_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_LE_OQ:
case SIMDE_CMP_LE_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = ((a_.f64[i] != a_.f64[i]) || (b_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] != b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == a_.f64) & (b_.f64 == b_.f64) & (a_.f64 != b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i]) & (a_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NLT_UQ:
case SIMDE_CMP_NLT_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 < b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = !(a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NLE_UQ:
case SIMDE_CMP_NLE_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 <= b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = !(a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ((a_.f64 == a_.f64) & (b_.f64 == b_.f64)));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i])) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64) | (a_.f64 == b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = ((a_.f64[i] != a_.f64[i]) | (b_.f64[i] != b_.f64[i]) | (a_.f64[i] == b_.f64[i])) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NGE_UQ:
case SIMDE_CMP_NGE_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 >= b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = !(a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_NGT_UQ:
case SIMDE_CMP_NGT_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 > b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = !(a_.f64[i] > b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
break;
case SIMDE_CMP_GE_OQ:
case SIMDE_CMP_GE_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_GT_OQ:
case SIMDE_CMP_GT_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.i64[i] = (a_.f64[i] > b_.f64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
r_ = simde__m256d_to_private(simde_x_mm256_setone_pd());
break;
default:
HEDLEY_UNREACHABLE();
}
return simde__m256d_from_private(r_);
}
#if defined(__clang__) && defined(__AVX512DQ__)
#define simde_mm256_cmp_pd(a, b, imm8) (__extension__ ({ \
simde__m256d simde_mm256_cmp_pd_r; \
switch (imm8) { \
case SIMDE_CMP_FALSE_OQ: \
case SIMDE_CMP_FALSE_OS: \
simde_mm256_cmp_pd_r = simde_mm256_setzero_pd(); \
break; \
case SIMDE_CMP_TRUE_UQ: \
case SIMDE_CMP_TRUE_US: \
simde_mm256_cmp_pd_r = simde_x_mm256_setone_pd(); \
break; \
default: \
simde_mm256_cmp_pd_r = simde_mm256_cmp_pd_internal_(a, b, imm8); \
break; \
} \
simde_mm256_cmp_pd_r; \
}))
#elif defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_cmp_pd(a, b, imm8) _mm256_cmp_pd(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmp_pd
#define _mm256_cmp_pd(a, b, imm8) simde_mm256_cmp_pd(a, b, imm8)
#endif
SIMDE_HUGE_FUNCTION_ATTRIBUTES
simde__m256
#if defined(__clang__) && defined(__AVX512DQ__)
simde_mm256_cmp_ps_internal_
#else
simde_mm256_cmp_ps
#endif
(simde__m256 a, simde__m256 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
switch (imm8) {
case SIMDE_CMP_EQ_OQ:
case SIMDE_CMP_EQ_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] == b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_LT_OQ:
case SIMDE_CMP_LT_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] < b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_LE_OQ:
case SIMDE_CMP_LE_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_UNORD_Q:
case SIMDE_CMP_UNORD_S:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = ((a_.f32[i] != a_.f32[i]) || (b_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NEQ_UQ:
case SIMDE_CMP_NEQ_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] != b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NEQ_OQ:
case SIMDE_CMP_NEQ_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == a_.f32) & (b_.f32 == b_.f32) & (a_.f32 != b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i]) & (a_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NLT_UQ:
case SIMDE_CMP_NLT_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 < b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = !(a_.f32[i] < b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NLE_UQ:
case SIMDE_CMP_NLE_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 <= b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = !(a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_ORD_Q:
case SIMDE_CMP_ORD_S:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ((a_.f32 == a_.f32) & (b_.f32 == b_.f32)));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_EQ_UQ:
case SIMDE_CMP_EQ_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32) | (a_.f32 == b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = ((a_.f32[i] != a_.f32[i]) | (b_.f32[i] != b_.f32[i]) | (a_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NGE_UQ:
case SIMDE_CMP_NGE_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 >= b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = !(a_.f32[i] >= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_NGT_UQ:
case SIMDE_CMP_NGT_US:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 > b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = !(a_.f32[i] > b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_FALSE_OQ:
case SIMDE_CMP_FALSE_OS:
r_ = simde__m256_to_private(simde_mm256_setzero_ps());
break;
case SIMDE_CMP_GE_OQ:
case SIMDE_CMP_GE_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] >= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_GT_OQ:
case SIMDE_CMP_GT_OS:
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = (a_.f32[i] > b_.f32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
break;
case SIMDE_CMP_TRUE_UQ:
case SIMDE_CMP_TRUE_US:
r_ = simde__m256_to_private(simde_x_mm256_setone_ps());
break;
default:
HEDLEY_UNREACHABLE();
}
return simde__m256_from_private(r_);
}
#if defined(__clang__) && defined(__AVX512DQ__)
#define simde_mm256_cmp_ps(a, b, imm8) (__extension__ ({ \
simde__m256 simde_mm256_cmp_ps_r; \
switch (imm8) { \
case SIMDE_CMP_FALSE_OQ: \
case SIMDE_CMP_FALSE_OS: \
simde_mm256_cmp_ps_r = simde_mm256_setzero_ps(); \
break; \
case SIMDE_CMP_TRUE_UQ: \
case SIMDE_CMP_TRUE_US: \
simde_mm256_cmp_ps_r = simde_x_mm256_setone_ps(); \
break; \
default: \
simde_mm256_cmp_ps_r = simde_mm256_cmp_ps_internal_(a, b, imm8); \
break; \
} \
simde_mm256_cmp_ps_r; \
}))
#elif defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_cmp_ps(a, b, imm8) _mm256_cmp_ps(a, b, imm8)
#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#define simde_mm256_cmp_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \
simde__m256_private \
simde_mm256_cmp_ps_r_, \
simde_mm256_cmp_ps_a_ = simde__m256_to_private((a)), \
simde_mm256_cmp_ps_b_ = simde__m256_to_private((b)); \
\
for (size_t i = 0 ; i < (sizeof(simde_mm256_cmp_ps_r_.m128) / sizeof(simde_mm256_cmp_ps_r_.m128[0])) ; i++) { \
simde_mm256_cmp_ps_r_.m128[i] = simde_mm_cmp_ps(simde_mm256_cmp_ps_a_.m128[i], simde_mm256_cmp_ps_b_.m128[i], (imm8)); \
} \
\
simde__m256_from_private(simde_mm256_cmp_ps_r_); \
}))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmp_ps
#define _mm256_cmp_ps(a, b, imm8) simde_mm256_cmp_ps(a, b, imm8)
#endif
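/* simde_x_mm256_copysign_{ps,pd} are SIMDe-internal helpers (no
 * corresponding AVX intrinsic).  They use the C library's copysign when
 * available and otherwise combine the operands through a sign-bit mask
 * built by XORing +0.0 with -0.0. */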
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_copysign_ps(simde__m256 dest, simde__m256 src) {
simde__m256_private
r_,
dest_ = simde__m256_to_private(dest),
src_ = simde__m256_to_private(src);
#if defined(simde_math_copysignf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
}
#else
simde__m256 sgnbit = simde_mm256_xor_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), simde_mm256_set1_ps(-SIMDE_FLOAT32_C(0.0)));
return simde_mm256_xor_ps(simde_mm256_and_ps(sgnbit, src), simde_mm256_andnot_ps(sgnbit, dest));
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_copysign_pd(simde__m256d dest, simde__m256d src) {
simde__m256d_private
r_,
dest_ = simde__m256d_to_private(dest),
src_ = simde__m256d_to_private(src);
#if defined(simde_math_copysign)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
}
#else
simde__m256d sgnbit = simde_mm256_xor_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm256_set1_pd(-SIMDE_FLOAT64_C(0.0)));
return simde_mm256_xor_pd(simde_mm256_and_pd(sgnbit, src), simde_mm256_andnot_pd(sgnbit, dest));
#endif
return simde__m256d_from_private(r_);
}
HEDLEY_DIAGNOSTIC_POP /* -Wfloat-equal */
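/* Conversions: cvtpd_epi32 and cvtps_epi32 round according to the current
 * rounding mode (via nearbyint), while the cvtt* variants truncate toward
 * zero; the int->float and float<->double conversions below are plain
 * casts. */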
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_cvtepi32_pd (simde__m128i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtepi32_pd(a);
#else
simde__m256d_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = HEDLEY_STATIC_CAST(simde_float64, a_.i32[i]);
}
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi32_pd
#define _mm256_cvtepi32_pd(a) simde_mm256_cvtepi32_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_cvtepi32_ps (simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtepi32_ps(a);
#else
simde__m256_private r_;
simde__m256i_private a_ = simde__m256i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]);
}
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi32_ps
#define _mm256_cvtepi32_ps(a) simde_mm256_cvtepi32_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_cvtpd_epi32 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtpd_epi32(a);
#else
simde__m128i_private r_;
simde__m256d_private a_ = simde__m256d_to_private(a);
#if defined(simde_math_nearbyint)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtpd_epi32
#define _mm256_cvtpd_epi32(a) simde_mm256_cvtpd_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm256_cvtpd_ps (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtpd_ps(a);
#else
simde__m128_private r_;
simde__m256d_private a_ = simde__m256d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[i]);
}
return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtpd_ps
#define _mm256_cvtpd_ps(a) simde_mm256_cvtpd_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtps_epi32 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtps_epi32(a);
#else
simde__m256i_private r_;
simde__m256_private a_ = simde__m256_to_private(a);
#if defined(simde_math_nearbyintf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i]));
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtps_epi32
#define _mm256_cvtps_epi32(a) simde_mm256_cvtps_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_cvtps_pd (simde__m128 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtps_pd(a);
#else
simde__m256d_private r_;
simde__m128_private a_ = simde__m128_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r_.f64[i] = HEDLEY_STATIC_CAST(double, a_.f32[i]);
}
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtps_pd
#define _mm256_cvtps_pd(a) simde_mm256_cvtps_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64
simde_mm256_cvtsd_f64 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE) && ( \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
HEDLEY_MSVC_VERSION_CHECK(19,14,0))
return _mm256_cvtsd_f64(a);
#else
simde__m256d_private a_ = simde__m256d_to_private(a);
return a_.f64[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtsd_f64
#define _mm256_cvtsd_f64(a) simde_mm256_cvtsd_f64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm256_cvtsi256_si32 (simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE) && ( \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
HEDLEY_MSVC_VERSION_CHECK(19,14,0))
return _mm256_cvtsi256_si32(a);
#else
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.i32[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtsi256_si32
#define _mm256_cvtsi256_si32(a) simde_mm256_cvtsi256_si32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float32
simde_mm256_cvtss_f32 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE) && ( \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \
HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
HEDLEY_MSVC_VERSION_CHECK(19,14,0))
return _mm256_cvtss_f32(a);
#else
simde__m256_private a_ = simde__m256_to_private(a);
return a_.f32[0];
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtss_f32
#define _mm256_cvtss_f32(a) simde_mm256_cvtss_f32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_cvttpd_epi32 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvttpd_epi32(a);
#else
simde__m128i_private r_;
simde__m256d_private a_ = simde__m256d_to_private(a);
#if defined(simde_math_trunc)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_trunc(a_.f64[i]));
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvttpd_epi32
#define _mm256_cvttpd_epi32(a) simde_mm256_cvttpd_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvttps_epi32 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvttps_epi32(a);
#else
simde__m256i_private r_;
simde__m256_private a_ = simde__m256_to_private(a);
#if defined(simde_math_truncf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i]));
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvttps_epi32
#define _mm256_cvttps_epi32(a) simde_mm256_cvttps_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_div_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_div_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_div_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_div_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f32 = a_.f32 / b_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[i] / b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_div_ps
#define _mm256_div_ps(a, b) simde_mm256_div_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_div_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_div_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_div_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_div_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f64 = a_.f64 / b_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[i] / b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_div_pd
#define _mm256_div_pd(a, b) simde_mm256_div_pd(a, b)
#endif
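/* extractf128_{pd,ps,si256} return the 128-bit lane selected by imm8
 * (0 = low, 1 = high); the insertf128 counterparts further below overwrite
 * that lane and leave the other untouched. */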
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm256_extractf128_pd (simde__m256d a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256d_private a_ = simde__m256d_to_private(a);
return a_.m128d[imm8];
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_extractf128_pd(a, imm8) _mm256_extractf128_pd(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_extractf128_pd
#define _mm256_extractf128_pd(a, imm8) simde_mm256_extractf128_pd(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm256_extractf128_ps (simde__m256 a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256_private a_ = simde__m256_to_private(a);
return a_.m128[imm8];
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_extractf128_ps(a, imm8) _mm256_extractf128_ps(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_extractf128_ps
#define _mm256_extractf128_ps(a, imm8) simde_mm256_extractf128_ps(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_extractf128_si256 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.m128i[imm8];
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_extractf128_si256(a, imm8) _mm256_extractf128_si256(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_extractf128_si256
#define _mm256_extractf128_si256(a, imm8) simde_mm256_extractf128_si256(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_floor_pd (simde__m256d a) {
return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_floor_pd
#define _mm256_floor_pd(a) simde_mm256_floor_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_floor_ps (simde__m256 a) {
return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_floor_ps
#define _mm256_floor_ps(a) simde_mm256_floor_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_insert_epi8 (simde__m256i a, int8_t i, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 31) {
simde__m256i_private a_ = simde__m256i_to_private(a);
a_.i8[index] = i;
return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_insert_epi8(a, i, index) _mm256_insert_epi8(a, i, index)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insert_epi8
#define _mm256_insert_epi8(a, i, index) simde_mm256_insert_epi8(a, i, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_insert_epi16 (simde__m256i a, int16_t i, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 15) {
simde__m256i_private a_ = simde__m256i_to_private(a);
a_.i16[index] = i;
return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_insert_epi16(a, i, index) _mm256_insert_epi16(a, i, index)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insert_epi16
  #define _mm256_insert_epi16(a, i, index) simde_mm256_insert_epi16(a, i, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_insert_epi32 (simde__m256i a, int32_t i, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 7) {
simde__m256i_private a_ = simde__m256i_to_private(a);
a_.i32[index] = i;
return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_insert_epi32(a, i, index) _mm256_insert_epi32(a, i, index)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insert_epi32
#define _mm256_insert_epi32(a, i, index) simde_mm256_insert_epi32(a, i, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_insert_epi64 (simde__m256i a, int64_t i, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 3) {
simde__m256i_private a_ = simde__m256i_to_private(a);
a_.i64[index] = i;
return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
(!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
#define simde_mm256_insert_epi64(a, i, index) _mm256_insert_epi64(a, i, index)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
#undef _mm256_insert_epi64
#define _mm256_insert_epi64(a, i, index) simde_mm256_insert_epi64(a, i, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_insertf128_pd (simde__m256d a, simde__m128d b, int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256d_private a_ = simde__m256d_to_private(a);
simde__m128d_private b_ = simde__m128d_to_private(b);
a_.m128d_private[imm8] = b_;
return simde__m256d_from_private(a_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insertf128_pd
#define _mm256_insertf128_pd(a, b, imm8) simde_mm256_insertf128_pd(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_insertf128_ps (simde__m256 a, simde__m128 b, int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256_private a_ = simde__m256_to_private(a);
simde__m128_private b_ = simde__m128_to_private(b);
a_.m128_private[imm8] = b_;
return simde__m256_from_private(a_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insertf128_ps
#define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_insertf128_si256 (simde__m256i a, simde__m128i b, int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256i_private a_ = simde__m256i_to_private(a);
simde__m128i_private b_ = simde__m128i_to_private(b);
a_.m128i_private[imm8] = b_;
return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insertf128_si256
#define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8)
#endif
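/* _mm256_dp_ps computes the conditional dot product independently within
 * each 128-bit lane, so the fallback simply applies simde_mm_dp_ps to the
 * two lanes with the same immediate. */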
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_dp_ps(a, b, imm8) _mm256_dp_ps(a, b, imm8)
#else
# define simde_mm256_dp_ps(a, b, imm8) \
simde_mm256_set_m128( \
simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), imm8), \
simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), imm8))
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_dp_ps
#define _mm256_dp_ps(a, b, imm8) simde_mm256_dp_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm256_extract_epi32 (simde__m256i a, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 7) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.i32[index];
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_extract_epi32
#define _mm256_extract_epi32(a, index) simde_mm256_extract_epi32(a, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm256_extract_epi64 (simde__m256i a, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 3) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.i64[index];
}
#if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)
#define simde_mm256_extract_epi64(a, index) _mm256_extract_epi64(a, index)
#endif
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
#undef _mm256_extract_epi64
#define _mm256_extract_epi64(a, index) simde_mm256_extract_epi64(a, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_lddqu_si256 (simde__m256i const * mem_addr) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(mem_addr);
#else
simde__m256i r;
simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_lddqu_si256
#define _mm256_lddqu_si256(a) simde_mm256_lddqu_si256(a)
#endif
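/* load_{pd,ps,si256} require 32-byte-aligned pointers (the fallbacks assume
 * this via SIMDE_ALIGN_ASSUME_LIKE), whereas the loadu_* variants accept
 * unaligned pointers and are emulated with a plain memcpy. */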
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_load_pd (const double mem_addr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_load_pd(mem_addr);
#else
simde__m256d r;
simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_load_pd
#define _mm256_load_pd(a) simde_mm256_load_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_load_ps (const float mem_addr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_load_ps(mem_addr);
#else
simde__m256 r;
simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_load_ps
#define _mm256_load_ps(a) simde_mm256_load_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_load_si256 (simde__m256i const * mem_addr) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_load_si256(mem_addr);
#else
simde__m256i r;
simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_load_si256
#define _mm256_load_si256(a) simde_mm256_load_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_loadu_pd (const double a[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_pd(a);
#else
simde__m256d r;
simde_memcpy(&r, a, sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu_pd
#define _mm256_loadu_pd(a) simde_mm256_loadu_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_ps(a);
#else
simde__m256 r;
simde_memcpy(&r, a, sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu_ps
#define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a)
#endif
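/* _mm256_loadu_epi{8,16,32,64} are AVX-512VL (and, for 8/16-bit elements,
 * AVX-512BW) intrinsics; without a mask they are all just 256-bit unaligned
 * loads, so the element width only affects which native intrinsic can be
 * used.  The simde_x_* names are kept as aliases for internal use. */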
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu_epi8(void const * mem_addr) {
#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
return _mm256_loadu_epi8(mem_addr);
#elif defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
#else
simde__m256i r;
simde_memcpy(&r, mem_addr, sizeof(r));
return r;
#endif
}
#define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
#undef _mm256_loadu_epi8
#define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu_epi16(void const * mem_addr) {
#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
return _mm256_loadu_epi16(mem_addr);
#elif defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
#else
simde__m256i r;
simde_memcpy(&r, mem_addr, sizeof(r));
return r;
#endif
}
#define simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
#undef _mm256_loadu_epi16
#define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu_epi32(void const * mem_addr) {
#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
return _mm256_loadu_epi32(mem_addr);
#elif defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
#else
simde__m256i r;
simde_memcpy(&r, mem_addr, sizeof(r));
return r;
#endif
}
#define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
#undef _mm256_loadu_epi32
#define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu_epi64(void const * mem_addr) {
#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
return _mm256_loadu_epi64(mem_addr);
#elif defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr));
#else
simde__m256i r;
simde_memcpy(&r, mem_addr, sizeof(r));
return r;
#endif
}
#define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
#undef _mm256_loadu_epi64
#define _mm256_loadu_epi64(a) simde_mm256_loadu_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu_si256 (void const * mem_addr) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_loadu_si256(SIMDE_ALIGN_CAST(const __m256i*, mem_addr));
#else
simde__m256i r;
simde_memcpy(&r, mem_addr, sizeof(r));
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu_si256
#define _mm256_loadu_si256(mem_addr) simde_mm256_loadu_si256(mem_addr)
#endif
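/* loadu2_m128{,d,i} build a 256-bit vector from two unaligned 128-bit
 * halves: loaddr fills the low lane and hiaddr the high lane, emulated with
 * a 128->256 cast followed by insertf128. */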
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
return _mm256_loadu2_m128(hiaddr, loaddr);
#else
return
simde_mm256_insertf128_ps(simde_mm256_castps128_ps256(simde_mm_loadu_ps(loaddr)),
simde_mm_loadu_ps(hiaddr), 1);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu2_m128
#define _mm256_loadu2_m128(hiaddr, loaddr) simde_mm256_loadu2_m128(hiaddr, loaddr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
return _mm256_loadu2_m128d(hiaddr, loaddr);
#else
return
simde_mm256_insertf128_pd(simde_mm256_castpd128_pd256(simde_mm_loadu_pd(loaddr)),
simde_mm_loadu_pd(hiaddr), 1);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu2_m128d
#define _mm256_loadu2_m128d(hiaddr, loaddr) simde_mm256_loadu2_m128d(hiaddr, loaddr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
return _mm256_loadu2_m128i(hiaddr, loaddr);
#else
return
simde_mm256_insertf128_si256(simde_mm256_castsi128_si256(simde_mm_loadu_si128(loaddr)),
simde_mm_loadu_si128(hiaddr), 1);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(hiaddr, loaddr) simde_mm256_loadu2_m128i(hiaddr, loaddr)
#endif
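/* maskload: load only the elements whose mask element has its most
 * significant bit set and zero the rest.  Note that the portable fallbacks
 * below perform a full-width load and then mask the result, so, unlike the
 * hardware instruction, they may read memory in masked-out lanes. */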
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
return _mm_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask));
#else
return _mm_maskload_pd(mem_addr, mask);
#endif
#else
simde__m128d_private
mem_ = simde__m128d_to_private(simde_mm_loadu_pd(mem_addr)),
r_;
simde__m128i_private mask_ = simde__m128i_to_private(mask);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63);
}
#endif
return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_maskload_pd
#define _mm_maskload_pd(mem_addr, mask) simde_mm_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
return _mm256_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask));
#else
return _mm256_maskload_pd(mem_addr, mask);
#endif
#else
simde__m256d_private r_;
simde__m256i_private mask_ = simde__m256i_to_private(mask);
r_ = simde__m256d_to_private(simde_mm256_loadu_pd(mem_addr));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.i64[i] &= mask_.i64[i] >> 63;
}
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskload_pd
#define _mm256_maskload_pd(mem_addr, mask) simde_mm256_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
return _mm_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask));
#else
return _mm_maskload_ps(mem_addr, mask);
#endif
#else
simde__m128_private
mem_ = simde__m128_to_private(simde_mm_loadu_ps(mem_addr)),
r_;
simde__m128i_private mask_ = simde__m128i_to_private(mask);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31);
}
#endif
return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_maskload_ps
#define _mm_maskload_ps(mem_addr, mask) simde_mm_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
return _mm256_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask));
#else
return _mm256_maskload_ps(mem_addr, mask);
#endif
#else
simde__m256_private r_;
simde__m256i_private mask_ = simde__m256i_to_private(mask);
r_ = simde__m256_to_private(simde_mm256_loadu_ps(mem_addr));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] &= mask_.i32[i] >> 31;
}
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskload_ps
#define _mm256_maskload_ps(mem_addr, mask) simde_mm256_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask)
#endif
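/* maskstore: store only the elements whose mask element has its most
 * significant bit set; memory in masked-out lanes is left untouched. */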
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
_mm_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask), a);
#else
_mm_maskstore_pd(mem_addr, mask, a);
#endif
#else
simde__m128i_private mask_ = simde__m128i_to_private(mask);
simde__m128d_private a_ = simde__m128d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
if (mask_.u64[i] & (UINT64_C(1) << 63))
mem_addr[i] = a_.f64[i];
}
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_maskstore_pd
#define _mm_maskstore_pd(mem_addr, mask, a) simde_mm_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
_mm256_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask), a);
#else
_mm256_maskstore_pd(mem_addr, mask, a);
#endif
#else
simde__m256i_private mask_ = simde__m256i_to_private(mask);
simde__m256d_private a_ = simde__m256d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
if (mask_.u64[i] & (UINT64_C(1) << 63))
mem_addr[i] = a_.f64[i];
}
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskstore_pd
#define _mm256_maskstore_pd(mem_addr, mask, a) simde_mm256_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
_mm_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask), a);
#else
_mm_maskstore_ps(mem_addr, mask, a);
#endif
#else
simde__m128i_private mask_ = simde__m128i_to_private(mask);
simde__m128_private a_ = simde__m128_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
if (mask_.u32[i] & (UINT32_C(1) << 31))
mem_addr[i] = a_.f32[i];
}
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_maskstore_ps
#define _mm_maskstore_ps(mem_addr, mask, a) simde_mm_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
_mm256_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask), a);
#else
_mm256_maskstore_ps(mem_addr, mask, a);
#endif
#else
simde__m256i_private mask_ = simde__m256i_to_private(mask);
simde__m256_private a_ = simde__m256_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
if (mask_.u32[i] & (UINT32_C(1) << 31))
mem_addr[i] = a_.f32[i];
}
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskstore_ps
#define _mm256_maskstore_ps(mem_addr, mask, a) simde_mm256_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a)
#endif
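/* min/max: element-wise minimum and maximum.  The scalar fallbacks follow
 * the x86 convention of returning the second operand when either input is
 * NaN or when comparing signed zeros. */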
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_min_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_min_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_min_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_min_ps(a_.m128[1], b_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_min_ps
#define _mm256_min_ps(a, b) simde_mm256_min_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_min_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_min_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_min_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_min_pd(a_.m128d[1], b_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_min_pd
#define _mm256_min_pd(a, b) simde_mm256_min_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_max_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_max_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_max_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_max_ps(a_.m128[1], b_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_max_ps
#define _mm256_max_ps(a, b) simde_mm256_max_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_max_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_max_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_max_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_max_pd(a_.m128d[1], b_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_max_pd
#define _mm256_max_pd(a, b) simde_mm256_max_pd(a, b)
#endif
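/* movedup_pd duplicates the even-indexed doubles, moveldup_ps duplicates
 * the even-indexed floats and movehdup_ps duplicates the odd-indexed
 * floats. */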
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_movedup_pd (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_movedup_pd(a);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, a_.f64, 0, 0, 2, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) {
r_.f64[i] = r_.f64[i + 1] = a_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_movedup_pd
#define _mm256_movedup_pd(a) simde_mm256_movedup_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_movehdup_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_movehdup_ps(a);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 1, 1, 3, 3, 5, 5, 7, 7);
#else
SIMDE_VECTORIZE
for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
r_.f32[i - 1] = r_.f32[i] = a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_movehdup_ps
#define _mm256_movehdup_ps(a) simde_mm256_movehdup_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_moveldup_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_moveldup_ps(a);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 0, 0, 2, 2, 4, 4, 6, 6);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) {
r_.f32[i] = r_.f32[i + 1] = a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_moveldup_ps
#define _mm256_moveldup_ps(a) simde_mm256_moveldup_ps(a)
#endif
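/* movemask: collect the sign bit of every element into the low bits of an
 * int, with bit i taken from element i. */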
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_movemask_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_movemask_ps(a);
#else
simde__m256_private a_ = simde__m256_to_private(a);
int r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r |= (a_.u32[i] >> 31) << i;
}
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_movemask_ps
#define _mm256_movemask_ps(a) simde_mm256_movemask_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_movemask_pd (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_movemask_pd(a);
#else
simde__m256d_private a_ = simde__m256d_to_private(a);
int r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
r |= (a_.u64[i] >> 63) << i;
}
return r;
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_movemask_pd
#define _mm256_movemask_pd(a) simde_mm256_movemask_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_mul_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_mul_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_mul_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_mul_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f32 = a_.f32 * b_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[i] * b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_mul_ps
#define _mm256_mul_ps(a, b) simde_mm256_mul_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_mul_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_mul_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_mul_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_mul_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f64 = a_.f64 * b_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[i] * b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_mul_pd
#define _mm256_mul_pd(a, b) simde_mm256_mul_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_or_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_or_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_or_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_or_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f | b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
r_.u32[i] = a_.u32[i] | b_.u32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_or_ps
#define _mm256_or_ps(a, b) simde_mm256_or_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_or_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_or_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_or_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_or_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f | b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
r_.u64[i] = a_.u64[i] | b_.u64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_or_pd
#define _mm256_or_pd(a, b) simde_mm256_or_pd(a, b)
#endif
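/* permute_ps/permute_pd: shuffle elements within each 128-bit lane using an
 * immediate.  permute_ps uses two selector bits per float (both lanes of
 * the 256-bit version read the same eight bits); permute_pd uses one
 * selector bit per double. */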
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_permute_ps (simde__m256 a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.m128_private[i >> 2].f32[(imm8 >> ((i << 1) & 7)) & 3];
}
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute_ps
#define _mm256_permute_ps(a, imm8) simde_mm256_permute_ps(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_permute_pd (simde__m256d a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)];
}
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_permute_pd(a, imm8) _mm256_permute_pd(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute_pd
#define _mm256_permute_pd(a, imm8) simde_mm256_permute_pd(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_permute_ps (simde__m128 a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m128_private
r_,
a_ = simde__m128_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[(imm8 >> ((i << 1) & 7)) & 3];
}
return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_permute_ps
#define _mm_permute_ps(a, imm8) simde_mm_permute_ps(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_permute_pd (simde__m128d a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
simde__m128d_private
r_,
a_ = simde__m128d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)];
}
return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_permute_pd
#define _mm_permute_pd(a, imm8) simde_mm_permute_pd(a, imm8)
#endif
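/* permutevar: like permute, but the selectors come from an integer vector
 * instead of an immediate (bits 1:0 of each 32-bit element for floats,
 * bit 1 of each 64-bit element for doubles). */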
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_permutevar_ps(a, b);
#else
simde__m128_private
r_,
a_ = simde__m128_to_private(a);
simde__m128i_private b_ = simde__m128i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[b_.i32[i] & 3];
}
return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_permutevar_ps
#define _mm_permutevar_ps(a, b) simde_mm_permutevar_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_permutevar_pd(a, b);
#else
simde__m128d_private
r_,
a_ = simde__m128d_to_private(a);
simde__m128i_private b_ = simde__m128i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1];
}
return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_permutevar_pd
#define _mm_permutevar_pd(a, b) simde_mm_permutevar_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_permutevar_ps (simde__m256 a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_permutevar_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
simde__m256i_private b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)];
}
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutevar_ps
#define _mm256_permutevar_ps(a, b) simde_mm256_permutevar_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_permutevar_pd (simde__m256d a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_permutevar_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
simde__m256i_private b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)];
}
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutevar_pd
#define _mm256_permutevar_pd(a, b) simde_mm256_permutevar_pd(a, b)
#endif
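/* permute2f128: each 128-bit half of the result is one of the four source
 * halves selected by imm8; setting bit 3 (low half) or bit 7 (high half)
 * zeroes that half instead. */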
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_permute2f128_ps (simde__m256 a, simde__m256 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
r_.m128_private[0] = (imm8 & 0x08) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x02) ? b_.m128_private[(imm8 ) & 1] : a_.m128_private[(imm8 ) & 1]);
r_.m128_private[1] = (imm8 & 0x80) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x20) ? b_.m128_private[(imm8 >> 4) & 1] : a_.m128_private[(imm8 >> 4) & 1]);
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_permute2f128_ps(a, b, imm8) _mm256_permute2f128_ps(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(a, b, imm8) simde_mm256_permute2f128_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_permute2f128_pd (simde__m256d a, simde__m256d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
r_.m128d_private[0] = (imm8 & 0x08) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x02) ? b_.m128d_private[(imm8 ) & 1] : a_.m128d_private[(imm8 ) & 1]);
r_.m128d_private[1] = (imm8 & 0x80) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x20) ? b_.m128d_private[(imm8 >> 4) & 1] : a_.m128d_private[(imm8 >> 4) & 1]);
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_permute2f128_pd(a, b, imm8) _mm256_permute2f128_pd(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute2f128_pd
#define _mm256_permute2f128_pd(a, b, imm8) simde_mm256_permute2f128_pd(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permute2f128_si256 (simde__m256i a, simde__m256i b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]);
r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
# define simde_mm256_permute2f128_si256(a, b, imm8) _mm256_permute2f128_si256(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(a, b, imm8) simde_mm256_permute2f128_si256(a, b, imm8)
#endif
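/* rcp_ps/rsqrt_ps: the hardware instructions return approximations with a
 * maximum relative error of 1.5*2^-12, whereas the portable fallbacks
 * compute 1/x and 1/sqrt(x) exactly, so results may differ slightly from
 * native execution. */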
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_rcp_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_rcp_ps(a);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_rcp_ps(a_.m128[0]);
r_.m128[1] = simde_mm_rcp_ps(a_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_rcp_ps
#define _mm256_rcp_ps(a) simde_mm256_rcp_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_rsqrt_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_rsqrt_ps(a);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined(simde_math_sqrtf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_rsqrt_ps
#define _mm256_rsqrt_ps(a) simde_mm256_rsqrt_ps(a)
#endif
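/* setr_*: same as the corresponding set_* functions but with the arguments
 * in memory order, i.e. the first argument becomes the lowest-indexed
 * element. */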
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setr_epi8 (
int8_t e31, int8_t e30, int8_t e29, int8_t e28, int8_t e27, int8_t e26, int8_t e25, int8_t e24,
int8_t e23, int8_t e22, int8_t e21, int8_t e20, int8_t e19, int8_t e18, int8_t e17, int8_t e16,
int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8,
int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_epi8(
e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16,
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
return simde_mm256_set_epi8(
e0, e1, e2, e3, e4, e5, e6, e7,
e8, e9, e10, e11, e12, e13, e14, e15,
e16, e17, e18, e19, e20, e21, e22, e23,
e24, e25, e26, e27, e28, e29, e30, e31);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_epi8
#define _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setr_epi16 (
int16_t e15, int16_t e14, int16_t e13, int16_t e12, int16_t e11, int16_t e10, int16_t e9, int16_t e8,
int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_epi16(
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
return simde_mm256_set_epi16(
e0, e1, e2, e3, e4, e5, e6, e7,
e8, e9, e10, e11, e12, e13, e14, e15);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_epi16
#define _mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setr_epi32 (
int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
#else
return simde_mm256_set_epi32(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_epi32
#define _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setr_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_epi64x(e3, e2, e1, e0);
#else
return simde_mm256_set_epi64x(e0, e1, e2, e3);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_epi64x
#define _mm256_setr_epi64x(e3, e2, e1, e0) \
simde_mm256_setr_epi64x(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_setr_ps (
simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4,
simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0);
#else
return simde_mm256_set_ps(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_ps
#define _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_setr_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_setr_pd(e3, e2, e1, e0);
#else
return simde_mm256_set_pd(e0, e1, e2, e3);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_pd
#define _mm256_setr_pd(e3, e2, e1, e0) \
simde_mm256_setr_pd(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_setr_m128 (simde__m128 lo, simde__m128 hi) {
#if defined(SIMDE_X86_AVX_NATIVE) && \
!defined(SIMDE_BUG_GCC_REV_247851) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
return _mm256_setr_m128(lo, hi);
#else
return simde_mm256_set_m128(hi, lo);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_m128
#define _mm256_setr_m128(lo, hi) \
simde_mm256_setr_m128(lo, hi)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_setr_m128d (simde__m128d lo, simde__m128d hi) {
#if defined(SIMDE_X86_AVX_NATIVE) && \
!defined(SIMDE_BUG_GCC_REV_247851) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
return _mm256_setr_m128d(lo, hi);
#else
return simde_mm256_set_m128d(hi, lo);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_m128d
#define _mm256_setr_m128d(lo, hi) \
simde_mm256_setr_m128d(lo, hi)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setr_m128i (simde__m128i lo, simde__m128i hi) {
#if defined(SIMDE_X86_AVX_NATIVE) && \
!defined(SIMDE_BUG_GCC_REV_247851) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0)
return _mm256_setr_m128i(lo, hi);
#else
return simde_mm256_set_m128i(hi, lo);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setr_m128i
#define _mm256_setr_m128i(lo, hi) \
simde_mm256_setr_m128i(lo, hi)
#endif
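/* shuffle_ps/shuffle_pd: combine a and b within each 128-bit lane under
 * control of imm8.  For shuffle_ps the lower two results of each lane are
 * selected from a and the upper two from b (two selector bits each); for
 * shuffle_pd one selector bit per result alternates between a and b. */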
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3];
r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3];
r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3];
r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3];
r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3];
r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3];
r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3];
r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3];
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_shuffle_ps(a, b, imm8) _mm256_shuffle_ps(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#define simde_mm256_shuffle_ps(a, b, imm8) \
simde_mm256_set_m128( \
simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8)), \
simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8)))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#define simde_mm256_shuffle_ps(a, b, imm8) \
SIMDE_SHUFFLE_VECTOR_(32, 32, a, b, \
(((imm8) >> 0) & 3) + 0, \
(((imm8) >> 2) & 3) + 0, \
(((imm8) >> 4) & 3) + 8, \
(((imm8) >> 6) & 3) + 8, \
(((imm8) >> 0) & 3) + 4, \
(((imm8) >> 2) & 3) + 4, \
(((imm8) >> 4) & 3) + 12, \
(((imm8) >> 6) & 3) + 12)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_shuffle_ps
#define _mm256_shuffle_ps(a, b, imm8) simde_mm256_shuffle_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
r_.f64[0] = a_.f64[((imm8 ) & 1) ];
r_.f64[1] = b_.f64[((imm8 >> 1) & 1) ];
r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2];
r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2];
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_NATIVE)
#define simde_mm256_shuffle_pd(a, b, imm8) _mm256_shuffle_pd(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#define simde_mm256_shuffle_pd(a, b, imm8) \
simde_mm256_set_m128d( \
simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \
simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#define simde_mm256_shuffle_pd(a, b, imm8) \
SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \
(((imm8) >> 0) & 1) + 0, \
(((imm8) >> 1) & 1) + 4, \
(((imm8) >> 2) & 1) + 2, \
(((imm8) >> 3) & 1) + 6)
#endif
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_shuffle_pd
#define _mm256_shuffle_pd(a, b, imm8) simde_mm256_shuffle_pd(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_sqrt_ps (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_sqrt_ps(a);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_sqrt_ps(a_.m128[0]);
r_.m128[1] = simde_mm_sqrt_ps(a_.m128[1]);
#elif defined(simde_math_sqrtf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_sqrt_ps
#define _mm256_sqrt_ps(a) simde_mm256_sqrt_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_sqrt_pd (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_sqrt_pd(a);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_sqrt_pd(a_.m128d[0]);
r_.m128d[1] = simde_mm_sqrt_pd(a_.m128d[1]);
#elif defined(simde_math_sqrt)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_sqrt(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE();
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_sqrt_pd
#define _mm256_sqrt_pd(a) simde_mm256_sqrt_pd(a)
#endif
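/* store/stream require 32-byte aligned addresses while storeu accepts
 * unaligned ones.  The non-temporal hint of the stream variants is not
 * emulated by the portable fallbacks, which degrade to ordinary copies. */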
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_store_ps(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_store_ps
#define _mm256_store_ps(mem_addr, a) simde_mm256_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_store_pd(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_store_pd
#define _mm256_store_pd(mem_addr, a) simde_mm256_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_store_si256(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_store_si256
#define _mm256_store_si256(mem_addr, a) simde_mm256_store_si256(mem_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_storeu_ps(mem_addr, a);
#else
simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu_ps
#define _mm256_storeu_ps(mem_addr, a) simde_mm256_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_storeu_pd(mem_addr, a);
#else
simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu_pd
#define _mm256_storeu_pd(mem_addr, a) simde_mm256_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a);
#else
simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu_si256
#define _mm256_storeu_si256(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
_mm256_storeu2_m128(hi_addr, lo_addr, a);
#else
simde_mm_storeu_ps(lo_addr, simde_mm256_castps256_ps128(a));
simde_mm_storeu_ps(hi_addr, simde_mm256_extractf128_ps(a, 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu2_m128
#define _mm256_storeu2_m128(hi_addr, lo_addr, a) simde_mm256_storeu2_m128(hi_addr, lo_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
_mm256_storeu2_m128d(hi_addr, lo_addr, a);
#else
simde_mm_storeu_pd(lo_addr, simde_mm256_castpd256_pd128(a));
simde_mm_storeu_pd(hi_addr, simde_mm256_extractf128_pd(a, 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu2_m128d
#define _mm256_storeu2_m128d(hi_addr, lo_addr, a) simde_mm256_storeu2_m128d(hi_addr, lo_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_storeu2_m128i (simde__m128i* hi_addr, simde__m128i* lo_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS)
_mm256_storeu2_m128i(hi_addr, lo_addr, a);
#else
simde_mm_storeu_si128(lo_addr, simde_mm256_castsi256_si128(a));
simde_mm_storeu_si128(hi_addr, simde_mm256_extractf128_si256(a, 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_storeu2_m128i
#define _mm256_storeu2_m128i(hi_addr, lo_addr, a) simde_mm256_storeu2_m128i(hi_addr, lo_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_ps(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_stream_ps
#define _mm256_stream_ps(mem_addr, a) simde_mm256_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_pd(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_stream_pd
#define _mm256_stream_pd(mem_addr, a) simde_mm256_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
_mm256_stream_si256(mem_addr, a);
#else
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_stream_si256
#define _mm256_stream_si256(mem_addr, a) simde_mm256_stream_si256(mem_addr, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_sub_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_sub_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_sub_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_sub_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f32 = a_.f32 - b_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[i] - b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_sub_ps
#define _mm256_sub_ps(a, b) simde_mm256_sub_ps(a, b)
#endif
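/* hsub: horizontal subtract.  Within each 128-bit lane the results are
 * a0-a1, a2-a3 followed by b0-b1, b2-b3, implemented here as the even
 * elements minus the odd elements after deinterleaving a and b. */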
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_hsub_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_hsub_ps(a, b);
#else
return simde_mm256_sub_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hsub_ps
#define _mm256_hsub_ps(a, b) simde_mm256_hsub_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_sub_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_sub_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_sub_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_sub_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f64 = a_.f64 - b_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[i] - b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_sub_pd
#define _mm256_sub_pd(a, b) simde_mm256_sub_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_hsub_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_hsub_pd(a, b);
#else
return simde_mm256_sub_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hsub_pd
#define _mm256_hsub_pd(a, b) simde_mm256_hsub_pd(a, b)
#endif
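/* undefined_*: return a vector with unspecified contents.  Without the
 * __builtin_ia32_undef256 builtin the fallbacks either leave the value
 * deliberately uninitialized (with the corresponding diagnostic suppressed)
 * or fall back to zeros. */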
#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_undefined_ps (void) {
simde__m256_private r_;
#if \
defined(SIMDE_X86_AVX_NATIVE) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
(!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
r_.n = _mm256_undefined_ps();
#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
r_ = simde__m256_to_private(simde_mm256_setzero_ps());
#endif
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_undefined_ps
#define _mm256_undefined_ps() simde_mm256_undefined_ps()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_undefined_pd (void) {
simde__m256d_private r_;
#if \
defined(SIMDE_X86_AVX_NATIVE) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
(!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
r_.n = _mm256_undefined_pd();
#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
#endif
return simde__m256d_from_private(r_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_undefined_pd
#define _mm256_undefined_pd() simde_mm256_undefined_pd()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_undefined_si256 (void) {
simde__m256i_private r_;
#if \
defined(SIMDE_X86_AVX_NATIVE) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \
(!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256))
r_.n = _mm256_undefined_si256();
#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
#endif
return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_undefined_si256
#define _mm256_undefined_si256() simde_mm256_undefined_si256()
#endif
#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
HEDLEY_DIAGNOSTIC_POP
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_xor_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_xor_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_xor_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_xor_ps(a_.m128[1], b_.m128[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f ^ b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
r_.u32[i] = a_.u32[i] ^ b_.u32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_xor_ps
#define _mm256_xor_ps(a, b) simde_mm256_xor_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_xor_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_xor_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_xor_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_xor_pd(a_.m128d[1], b_.m128d[1]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f ^ b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
r_.u64[i] = a_.u64[i] ^ b_.u64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_xor_pd
#define _mm256_xor_pd(a, b) simde_mm256_xor_pd(a, b)
#endif
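/* The simde_x_* helpers below are SIMDe-internal and have no AVX
 * counterpart: xorsign XORs the sign bit of src into dest (negating the
 * elements of dest wherever src is negative) and negate flips the sign of
 * every element. */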
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_xorsign_ps(simde__m256 dest, simde__m256 src) {
return simde_mm256_xor_ps(simde_mm256_and_ps(simde_mm256_set1_ps(-0.0f), src), dest);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_xorsign_pd(simde__m256d dest, simde__m256d src) {
return simde_mm256_xor_pd(simde_mm256_and_pd(simde_mm256_set1_pd(-0.0), src), dest);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_negate_ps(simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return simde_mm256_xor_ps(a, _mm256_set1_ps(SIMDE_FLOAT32_C(-0.0)));
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined(SIMDE_VECTOR_NEGATE)
r_.f32 = -a_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = -a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_negate_pd(simde__m256d a) {
#if defined(SIMDE_X86_AVX2_NATIVE)
return simde_mm256_xor_pd(a, _mm256_set1_pd(SIMDE_FLOAT64_C(-0.0)));
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
#if defined(SIMDE_VECTOR_NEGATE)
r_.f64 = -a_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = -a_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
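/* unpackhi/unpacklo: interleave the upper (respectively lower) halves of
 * each 128-bit lane of a and b; elements never cross lanes. */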
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_unpackhi_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_unpackhi_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 2, 10, 3, 11, 6, 14, 7, 15);
#else
r_.f32[0] = a_.f32[2];
r_.f32[1] = b_.f32[2];
r_.f32[2] = a_.f32[3];
r_.f32[3] = b_.f32[3];
r_.f32[4] = a_.f32[6];
r_.f32[5] = b_.f32[6];
r_.f32[6] = a_.f32[7];
r_.f32[7] = b_.f32[7];
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_unpackhi_ps
#define _mm256_unpackhi_ps(a, b) simde_mm256_unpackhi_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_unpackhi_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_unpackhi_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7);
#else
r_.f64[0] = a_.f64[1];
r_.f64[1] = b_.f64[1];
r_.f64[2] = a_.f64[3];
r_.f64[3] = b_.f64[3];
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_unpackhi_pd
#define _mm256_unpackhi_pd(a, b) simde_mm256_unpackhi_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_unpacklo_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_unpacklo_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 8, 1, 9, 4, 12, 5, 13);
#else
r_.f32[0] = a_.f32[0];
r_.f32[1] = b_.f32[0];
r_.f32[2] = a_.f32[1];
r_.f32[3] = b_.f32[1];
r_.f32[4] = a_.f32[4];
r_.f32[5] = b_.f32[4];
r_.f32[6] = a_.f32[5];
r_.f32[7] = b_.f32[5];
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_unpacklo_ps
#define _mm256_unpacklo_ps(a, b) simde_mm256_unpacklo_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_unpacklo_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_unpacklo_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if defined(SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6);
#else
r_.f64[0] = a_.f64[0];
r_.f64[1] = b_.f64[0];
r_.f64[2] = a_.f64[2];
r_.f64[3] = b_.f64[2];
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_unpacklo_pd
#define _mm256_unpacklo_pd(a, b) simde_mm256_unpacklo_pd(a, b)
#endif
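/* zext*128*256: place a 128-bit vector in the low half of a 256-bit vector
 * and zero the upper half. */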
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_zextps128_ps256 (simde__m128 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_ps(_mm256_setzero_ps(), a, 0);
#else
simde__m256_private r_;
r_.m128_private[0] = simde__m128_to_private(a);
r_.m128_private[1] = simde__m128_to_private(simde_mm_setzero_ps());
return simde__m256_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_zextps128_ps256
#define _mm256_zextps128_ps256(a) simde_mm256_zextps128_ps256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_zextpd128_pd256 (simde__m128d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_pd(_mm256_setzero_pd(), a, 0);
#else
simde__m256d_private r_;
r_.m128d_private[0] = simde__m128d_to_private(a);
r_.m128d_private[1] = simde__m128d_to_private(simde_mm_setzero_pd());
return simde__m256d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_zextpd128_pd256
#define _mm256_zextpd128_pd256(a) simde_mm256_zextpd128_pd256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_zextsi128_si256 (simde__m128i a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_si256(_mm256_setzero_si256(), a, 0);
#else
simde__m256i_private r_;
r_.m128i_private[0] = simde__m128i_to_private(a);
r_.m128i_private[1] = simde__m128i_to_private(simde_mm_setzero_si128());
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_zextsi128_si256
#define _mm256_zextsi128_si256(a) simde_mm256_zextsi128_si256(a)
#endif
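/* vtest/ptest family: testz returns 1 when (a & b) is all zeros (ZF),
 * testc returns 1 when (~a & b) is all zeros (CF) and testnzc returns 1
 * when neither is the case.  The _ps/_pd forms only examine the sign bit
 * of each element. */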
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testc_ps (simde__m128 a, simde__m128 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testc_ps(a, b);
#else
simde__m128_private
a_ = simde__m128_to_private(a),
b_ = simde__m128_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 31);
m = wasm_v128_and(m, simde_mm_movehl_ps(m, m));
m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
return wasm_i32x4_extract_lane(m, 0);
#else
uint_fast32_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
r |= ~a_.u32[i] & b_.u32[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testc_ps
#define _mm_testc_ps(a, b) simde_mm_testc_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testc_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testc_pd(a, b);
#else
simde__m128d_private
a_ = simde__m128d_to_private(a),
b_ = simde__m128d_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 63);
return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1));
#else
uint_fast64_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
r |= ~a_.u64[i] & b_.u64[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testc_pd
#define _mm_testc_pd(a, b) simde_mm_testc_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testc_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testc_ps(a, b);
#else
uint_fast32_t r = 0;
simde__m256_private
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
r |= ~a_.u32[i] & b_.u32[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testc_ps
#define _mm256_testc_ps(a, b) simde_mm256_testc_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testc_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testc_pd(a, b);
#else
uint_fast64_t r = 0;
simde__m256d_private
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
r |= ~a_.u64[i] & b_.u64[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testc_pd
#define _mm256_testc_pd(a, b) simde_mm256_testc_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testc_si256(a, b);
#else
int_fast32_t r = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
r |= ~a_.i32f[i] & b_.i32f[i];
}
return HEDLEY_STATIC_CAST(int, !r);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testc_si256
#define _mm256_testc_si256(a, b) simde_mm256_testc_si256(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testz_ps (simde__m128 a, simde__m128 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testz_ps(a, b);
#else
simde__m128_private
a_ = simde__m128_to_private(a),
b_ = simde__m128_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 31);
m = wasm_v128_and(m, simde_mm_movehl_ps(m, m));
m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
return wasm_i32x4_extract_lane(m, 0);
#else
uint_fast32_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
r |= a_.u32[i] & b_.u32[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testz_ps
#define _mm_testz_ps(a, b) simde_mm_testz_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testz_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testz_pd(a, b);
#else
simde__m128d_private
a_ = simde__m128d_to_private(a),
b_ = simde__m128d_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 63);
return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1));
#else
uint_fast64_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
r |= a_.u64[i] & b_.u64[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testz_pd
#define _mm_testz_pd(a, b) simde_mm_testz_pd(a, b)
#endif
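
/* simde_mm256_testz_ps: ZF flag of the 256-bit VTESTPS; returns 1 when no
 * 32-bit lane of (a & b) has its sign bit set. */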
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testz_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testz_ps(a, b);
#else
uint_fast32_t r = 0;
simde__m256_private
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
r |= a_.u32[i] & b_.u32[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testz_ps
#define _mm256_testz_ps(a, b) simde_mm256_testz_ps(a, b)
#endif
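
/* simde_mm256_testz_pd: ZF flag of the 256-bit VTESTPD; returns 1 when no
 * 64-bit lane of (a & b) has its sign bit set. */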
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testz_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testz_pd(a, b);
#else
uint_fast64_t r = 0;
simde__m256d_private
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
r |= a_.u64[i] & b_.u64[i];
}
return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testz_pd
#define _mm256_testz_pd(a, b) simde_mm256_testz_pd(a, b)
#endif
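
/* simde_mm256_testz_si256: ZF flag of VPTEST; returns 1 when (a & b) is zero
 * across all 256 bits.  On targets whose natural vectors are no wider than
 * 128 bits this defers to simde_mm_testz_si128 on each half. */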
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testz_si256(a, b);
#else
int_fast32_t r = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
r |= a_.i32f[i] & b_.i32f[i];
}
r = !r;
#endif
return HEDLEY_STATIC_CAST(int, r);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testz_si256
#define _mm256_testz_si256(a, b) simde_mm256_testz_si256(a, b)
#endif
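
/* simde_mm_testnzc_ps: returns 1 when both ZF and CF of VTESTPS would be
 * zero, i.e. at least one 32-bit lane of (a & b) and at least one lane of
 * (~a & b) have their sign bits set. */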
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testnzc_ps (simde__m128 a, simde__m128 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testnzc_ps(a, b);
#else
simde__m128_private
a_ = simde__m128_to_private(a),
b_ = simde__m128_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u32x4_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 31);
v128_t m2 = wasm_u32x4_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 31);
m = wasm_v128_or(m, simde_mm_movehl_ps(m, m));
m2 = wasm_v128_or(m2, simde_mm_movehl_ps(m2, m2));
m = wasm_v128_or(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
m2 = wasm_v128_or(m2, simde_mm_shuffle_epi32(m2, SIMDE_MM_SHUFFLE(3, 2, 0, 1)));
return wasm_i32x4_extract_lane(m, 0) & wasm_i32x4_extract_lane(m2, 0);
#else
uint32_t rz = 0, rc = 0;
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
rc |= ~a_.u32[i] & b_.u32[i];
rz |= a_.u32[i] & b_.u32[i];
}
return
(rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
(rz >> ((sizeof(rz) * CHAR_BIT) - 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testnzc_ps
#define _mm_testnzc_ps(a, b) simde_mm_testnzc_ps(a, b)
#endif
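
/* simde_mm_testnzc_pd: returns 1 when both ZF and CF of VTESTPD would be
 * zero, i.e. some 64-bit lane of (a & b) and some lane of (~a & b) have
 * their sign bits set. */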
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm_testnzc_pd(a, b);
#else
simde__m128d_private
a_ = simde__m128d_to_private(a),
b_ = simde__m128d_to_private(b);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63);
v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63);
return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1))
& (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)));
#else
uint64_t rc = 0, rz = 0;
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
rc |= ~a_.u64[i] & b_.u64[i];
rz |= a_.u64[i] & b_.u64[i];
}
return
(rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
(rz >> ((sizeof(rz) * CHAR_BIT) - 1));
#endif
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_testnzc_pd
#define _mm_testnzc_pd(a, b) simde_mm_testnzc_pd(a, b)
#endif
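
/* simde_mm256_testnzc_ps: 256-bit VTESTPS variant; returns 1 when some
 * 32-bit lane of (a & b) and some lane of (~a & b) have their sign bits
 * set. */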
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testnzc_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testnzc_ps(a, b);
#else
uint32_t rc = 0, rz = 0;
simde__m256_private
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) {
rc |= ~a_.u32[i] & b_.u32[i];
rz |= a_.u32[i] & b_.u32[i];
}
return
(rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
(rz >> ((sizeof(rz) * CHAR_BIT) - 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testnzc_ps
#define _mm256_testnzc_ps(a, b) simde_mm256_testnzc_ps(a, b)
#endif
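
/* simde_mm256_testnzc_pd: 256-bit VTESTPD variant; returns 1 when some
 * 64-bit lane of (a & b) and some lane of (~a & b) have their sign bits
 * set. */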
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testnzc_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testnzc_pd(a, b);
#else
uint64_t rc = 0, rz = 0;
simde__m256d_private
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
rc |= ~a_.u64[i] & b_.u64[i];
rz |= a_.u64[i] & b_.u64[i];
}
return
(rc >> ((sizeof(rc) * CHAR_BIT) - 1)) &
(rz >> ((sizeof(rz) * CHAR_BIT) - 1));
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testnzc_pd
#define _mm256_testnzc_pd(a, b) simde_mm256_testnzc_pd(a, b)
#endif
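
/* simde_mm256_testnzc_si256: returns 1 when both ZF and CF of VPTEST would
 * be zero, i.e. (a & b) != 0 and (~a & b) != 0 over the full 256 bits.  Note
 * that the two reductions below must be tested for non-zero independently;
 * bitwise-ANDing them together would miss cases where the set bits land in
 * different positions. */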
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testnzc_si256(a, b);
#else
    int_fast32_t rc = 0, rz = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
rc |= ~a_.i32f[i] & b_.i32f[i];
rz |= a_.i32f[i] & b_.i32f[i];
}
    return (rc != 0) && (rz != 0);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_testnzc_si256
#define _mm256_testnzc_si256(a, b) simde_mm256_testnzc_si256(a, b)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX_H) */