#if !defined(SIMDE_X86_AVX512_BITSHUFFLE_H)
#define SIMDE_X86_AVX512_BITSHUFFLE_H

#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

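/* Semantics of the bit-shuffle family (VPSHUFBITQMB): for each 64-bit lane i
 * of b and each control byte j of c within that lane, result mask bit
 * (i * 8) + j is bit (control byte & 63) of b's lane i.  For example, with
 * lane 0 of b equal to UINT64_C(0x8000000000000001), a control byte of 0
 * yields a 1 (bit 0) and a control byte of 63 also yields a 1 (bit 63). */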
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask16
simde_mm_bitshuffle_epi64_mask (simde__m128i b, simde__m128i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_bitshuffle_epi64_mask(b, c);
  #else
    simde__m128i_private
      b_ = simde__m128i_to_private(b),
      c_ = simde__m128i_to_private(c);
    simde__mmask16 r = 0;

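    /* With GCC-style vector extensions, each iteration of the loop below
     * handles control-byte position i of both 64-bit lanes at once: it
     * gathers the addressed bit from each lane of b and accumulates it at
     * bit (8 * lane) + i of that lane's accumulator, using the per-lane
     * shift amounts in lshift; the accumulators are then OR'd together. */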
    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      __typeof__(b_.u64) rv = { 0, 0 };
      __typeof__(b_.u64) lshift = { 0, 8 };

      for (int8_t i = 0 ; i < 8 ; i++) {
        __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
        rv |= ((b_.u64 >> ct) & 1) << lshift;
        lshift += 1;
      }

      r =
        HEDLEY_STATIC_CAST(simde__mmask16, rv[0]) |
        HEDLEY_STATIC_CAST(simde__mmask16, rv[1]);
    #else
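      /* Portable fallback: walk every control byte of c and pick the
       * addressed bit (low six bits of the byte) out of the matching
       * 64-bit lane of b, placing it at mask bit (i * 8) + j. */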
      for (size_t i = 0 ; i < (sizeof(c_.m64_private) / sizeof(c_.m64_private[0])) ; i++) {
        SIMDE_VECTORIZE_REDUCTION(|:r)
        for (size_t j = 0 ; j < (sizeof(c_.m64_private[i].u8) / sizeof(c_.m64_private[i].u8[0])) ; j++) {
          r |= (((b_.u64[i] >> (c_.m64_private[i].u8[j] & 63)) & 1) << ((i * 8) + j));
        }
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_bitshuffle_epi64_mask
  #define _mm_bitshuffle_epi64_mask(b, c) simde_mm_bitshuffle_epi64_mask(b, c)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__mmask16
simde_mm_mask_bitshuffle_epi64_mask (simde__mmask16 k, simde__m128i b, simde__m128i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_mask_bitshuffle_epi64_mask(k, b, c);
  #else
    return (k & simde_mm_bitshuffle_epi64_mask(b, c));
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_bitshuffle_epi64_mask
  #define _mm_mask_bitshuffle_epi64_mask(k, b, c) simde_mm_mask_bitshuffle_epi64_mask(k, b, c)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__mmask32
simde_mm256_bitshuffle_epi64_mask (simde__m256i b, simde__m256i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_bitshuffle_epi64_mask(b, c);
  #else
    simde__m256i_private
      b_ = simde__m256i_to_private(b),
      c_ = simde__m256i_to_private(c);
    simde__mmask32 r = 0;

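    /* On targets whose natural vector size is at most 128 bits, handle each
     * 128-bit half separately and shift its 16-bit mask into place. */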
    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      for (size_t i = 0 ; i < sizeof(b_.m128i) / sizeof(b_.m128i[0]) ; i++) {
        r |= (HEDLEY_STATIC_CAST(simde__mmask32, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16));
      }
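    /* Otherwise, with vector extensions, gather one control-byte position of
     * all four 64-bit lanes per iteration; lshift = { 0, 8, 16, 24 } routes
     * each lane's bits to its byte of the 32-bit mask. */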
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      __typeof__(b_.u64) rv = { 0, 0, 0, 0 };
      __typeof__(b_.u64) lshift = { 0, 8, 16, 24 };

      for (int8_t i = 0 ; i < 8 ; i++) {
        __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
        rv |= ((b_.u64 >> ct) & 1) << lshift;
        lshift += 1;
      }

      r =
        HEDLEY_STATIC_CAST(simde__mmask32, rv[0]) |
        HEDLEY_STATIC_CAST(simde__mmask32, rv[1]) |
        HEDLEY_STATIC_CAST(simde__mmask32, rv[2]) |
        HEDLEY_STATIC_CAST(simde__mmask32, rv[3]);
    #else
      for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) {
        for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) {
          SIMDE_VECTORIZE_REDUCTION(|:r)
          for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) {
            r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k] & 63)) & 1) << ((i * 16) + (j * 8) + k));
          }
        }
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_bitshuffle_epi64_mask
  #define _mm256_bitshuffle_epi64_mask(b, c) simde_mm256_bitshuffle_epi64_mask(b, c)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__mmask32
simde_mm256_mask_bitshuffle_epi64_mask (simde__mmask32 k, simde__m256i b, simde__m256i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_bitshuffle_epi64_mask(k, b, c);
  #else
    return (k & simde_mm256_bitshuffle_epi64_mask(b, c));
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_bitshuffle_epi64_mask
  #define _mm256_mask_bitshuffle_epi64_mask(k, b, c) simde_mm256_mask_bitshuffle_epi64_mask(k, b, c)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__mmask64
simde_mm512_bitshuffle_epi64_mask (simde__m512i b, simde__m512i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE)
    return _mm512_bitshuffle_epi64_mask(b, c);
  #else
    simde__m512i_private
      b_ = simde__m512i_to_private(b),
      c_ = simde__m512i_to_private(c);
    simde__mmask64 r = 0;

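    /* Depending on the natural vector size, delegate to the 128-bit or
     * 256-bit helpers and shift each partial mask into its position. */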
    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      for (size_t i = 0 ; i < (sizeof(b_.m128i) / sizeof(b_.m128i[0])) ; i++) {
        r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16));
      }
    #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
      for (size_t i = 0 ; i < (sizeof(b_.m256i) / sizeof(b_.m256i[0])) ; i++) {
        r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm256_bitshuffle_epi64_mask(b_.m256i[i], c_.m256i[i])) << (i * 32));
      }
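    /* Otherwise gather one control-byte position of all eight 64-bit lanes
     * per iteration; lshift = { 0, 8, ..., 56 } routes each lane's bits to
     * its byte of the 64-bit mask. */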
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      __typeof__(b_.u64) rv = { 0, 0, 0, 0, 0, 0, 0, 0 };
      __typeof__(b_.u64) lshift = { 0, 8, 16, 24, 32, 40, 48, 56 };

      for (int8_t i = 0 ; i < 8 ; i++) {
        __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
        rv |= ((b_.u64 >> ct) & 1) << lshift;
        lshift += 1;
      }

      r =
        HEDLEY_STATIC_CAST(simde__mmask64, rv[0]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[1]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[2]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[3]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[4]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[5]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[6]) |
        HEDLEY_STATIC_CAST(simde__mmask64, rv[7]);
    #else
      for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) {
        for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) {
          SIMDE_VECTORIZE_REDUCTION(|:r)
          for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) {
            r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k] & 63)) & 1) << ((i * 16) + (j * 8) + k));
          }
        }
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
  #undef _mm512_bitshuffle_epi64_mask
  #define _mm512_bitshuffle_epi64_mask(b, c) simde_mm512_bitshuffle_epi64_mask(b, c)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__mmask64
simde_mm512_mask_bitshuffle_epi64_mask (simde__mmask64 k, simde__m512i b, simde__m512i c) {
  #if defined(SIMDE_X86_AVX512BITALG_NATIVE)
    return _mm512_mask_bitshuffle_epi64_mask(k, b, c);
  #else
    return (k & simde_mm512_bitshuffle_epi64_mask(b, c));
  #endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_bitshuffle_epi64_mask
  #define _mm512_mask_bitshuffle_epi64_mask(k, b, c) simde_mm512_mask_bitshuffle_epi64_mask(k, b, c)
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_AVX512_BITSHUFFLE_H) */