/* stk-code_catmod/lib/simd_wrapper/simde/x86/avx512/bitshuffle.h */
#if !defined(SIMDE_X86_AVX512_BITSHUFFLE_H)
#define SIMDE_X86_AVX512_BITSHUFFLE_H
#include "types.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
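/* Portable implementations of the AVX-512 BITALG bit-shuffle intrinsics
 * (VPSHUFBITQMB).  For each byte j of every 64-bit lane i of the selector c,
 * the low 6 bits of that byte pick a bit position within the corresponding
 * 64-bit lane of b; the selected bit becomes bit (i * 8 + j) of the returned
 * mask.  E.g. a selector byte of 0 reports bit 0 of its lane. */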
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask16
simde_mm_bitshuffle_epi64_mask (simde__m128i b, simde__m128i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_bitshuffle_epi64_mask(b, c);
#else
simde__m128i_private
b_ = simde__m128i_to_private(b),
c_ = simde__m128i_to_private(c);
simde__mmask16 r = 0;
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
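/* GCC/Clang vector-extension path: rv accumulates a partial mask per 64-bit
 * lane.  Iteration i extracts byte i of every lane of c (masked to 6 bits),
 * uses it to select one bit from the matching lane of b, and ORs that bit
 * into position (8 * lane + i) of rv[lane].  The per-lane partial masks are
 * combined into the final 16-bit mask below. */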
__typeof__(b_.u64) rv = { 0, 0 };
__typeof__(b_.u64) lshift = { 0, 8 };
for (int8_t i = 0 ; i < 8 ; i++) {
__typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
rv |= ((b_.u64 >> ct) & 1) << lshift;
lshift += 1;
}
r =
HEDLEY_STATIC_CAST(simde__mmask16, rv[0]) |
HEDLEY_STATIC_CAST(simde__mmask16, rv[1]);
#else
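/* Portable scalar fallback: walk the two 64-bit lanes and the eight selector
 * bytes of each lane, selecting one bit of b per byte. */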
for (size_t i = 0 ; i < (sizeof(c_.m64_private) / sizeof(c_.m64_private[0])) ; i++) {
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t j = 0 ; j < (sizeof(c_.m64_private[i].u8) / sizeof(c_.m64_private[i].u8[0])) ; j++) {
r |= (((b_.u64[i] >> (c_.m64_private[i].u8[j] & 63)) & 1) << ((i * 8) + j));
}
}
#endif
return r;
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_bitshuffle_epi64_mask
#define _mm_bitshuffle_epi64_mask(b, c) simde_mm_bitshuffle_epi64_mask(b, c)
#endif
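/* Example (a sketch; assumes <simde/x86/sse2.h> is also included so that
 * simde_mm_set1_epi8() and simde_mm_setzero_si128() are available):
 *
 *   simde__m128i b = simde_mm_set1_epi8(1);     // each 64-bit lane = 0x0101...01
 *   simde__m128i c = simde_mm_setzero_si128();  // every selector byte picks bit 0
 *   simde__mmask16 m = simde_mm_bitshuffle_epi64_mask(b, c);
 *   // bit 0 of each lane is set, so all 16 mask bits are set: m == 0xFFFF
 */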
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask16
simde_mm_mask_bitshuffle_epi64_mask (simde__mmask16 k, simde__m128i b, simde__m128i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_bitshuffle_epi64_mask(k, b, c);
#else
return (k & simde_mm_bitshuffle_epi64_mask(b, c));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_bitshuffle_epi64_mask
#define _mm_mask_bitshuffle_epi64_mask(k, b, c) simde_mm_mask_bitshuffle_epi64_mask(k, b, c)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask32
simde_mm256_bitshuffle_epi64_mask (simde__m256i b, simde__m256i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_bitshuffle_epi64_mask(b, c);
#else
simde__m256i_private
b_ = simde__m256i_to_private(b),
c_ = simde__m256i_to_private(c);
simde__mmask32 r = 0;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
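/* Target's natural vector width is at most 128 bits: process each 128-bit
 * half with the SSE-sized helper and merge the two 16-bit masks at offsets
 * 0 and 16. */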
for (size_t i = 0 ; i < sizeof(b_.m128i) / sizeof(b_.m128i[0]) ; i++) {
r |= (HEDLEY_STATIC_CAST(simde__mmask32, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16));
}
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
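/* Same per-byte selection as the 128-bit vector path, but across four 64-bit
 * lanes; rv[i] collects mask bits (8 * i) .. (8 * i + 7). */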
__typeof__(b_.u64) rv = { 0, 0, 0, 0 };
__typeof__(b_.u64) lshift = { 0, 8, 16, 24 };
for (int8_t i = 0 ; i < 8 ; i++) {
__typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
rv |= ((b_.u64 >> ct) & 1) << lshift;
lshift += 1;
}
r =
HEDLEY_STATIC_CAST(simde__mmask32, rv[0]) |
HEDLEY_STATIC_CAST(simde__mmask32, rv[1]) |
HEDLEY_STATIC_CAST(simde__mmask32, rv[2]) |
HEDLEY_STATIC_CAST(simde__mmask32, rv[3]);
#else
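/* Scalar fallback; the per-bit logic matches the 128-bit version above. */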
for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) {
for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) {
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) {
r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k] & 63)) & 1) << ((i * 16) + (j * 8) + k));
}
}
}
#endif
return r;
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_bitshuffle_epi64_mask
#define _mm256_bitshuffle_epi64_mask(b, c) simde_mm256_bitshuffle_epi64_mask(b, c)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask32
simde_mm256_mask_bitshuffle_epi64_mask (simde__mmask32 k, simde__m256i b, simde__m256i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_bitshuffle_epi64_mask(k, b, c);
#else
return (k & simde_mm256_bitshuffle_epi64_mask(b, c));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_bitshuffle_epi64_mask
#define _mm256_mask_bitshuffle_epi64_mask(k, b, c) simde_mm256_mask_bitshuffle_epi64_mask(k, b, c)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask64
simde_mm512_bitshuffle_epi64_mask (simde__m512i b, simde__m512i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_bitshuffle_epi64_mask(b, c);
#else
simde__m512i_private
b_ = simde__m512i_to_private(b),
c_ = simde__m512i_to_private(c);
simde__mmask64 r = 0;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
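/* Delegate to the narrower helpers: four 128-bit quarters merged at 16-bit
 * offsets here, or two 256-bit halves at 32-bit offsets in the branch below. */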
for (size_t i = 0 ; i < (sizeof(b_.m128i) / sizeof(b_.m128i[0])) ; i++) {
r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16));
}
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < (sizeof(b_.m256i) / sizeof(b_.m256i[0])) ; i++) {
r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm256_bitshuffle_epi64_mask(b_.m256i[i], c_.m256i[i])) << (i * 32));
}
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
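/* Vector-extension path over all eight 64-bit lanes; the final 64-bit mask
 * is the OR of the eight per-lane partial masks. */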
__typeof__(b_.u64) rv = { 0, 0, 0, 0, 0, 0, 0, 0 };
__typeof__(b_.u64) lshift = { 0, 8, 16, 24, 32, 40, 48, 56 };
for (int8_t i = 0 ; i < 8 ; i++) {
__typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63;
rv |= ((b_.u64 >> ct) & 1) << lshift;
lshift += 1;
}
r =
HEDLEY_STATIC_CAST(simde__mmask64, rv[0]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[1]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[2]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[3]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[4]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[5]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[6]) |
HEDLEY_STATIC_CAST(simde__mmask64, rv[7]);
#else
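/* Scalar fallback; same per-bit logic as the 128-bit version above. */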
for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) {
for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) {
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) {
r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k] & 63)) & 1) << ((i * 16) + (j * 8) + k));
}
}
}
#endif
return r;
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_bitshuffle_epi64_mask
#define _mm512_bitshuffle_epi64_mask(b, c) simde_mm512_bitshuffle_epi64_mask(b, c)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__mmask64
simde_mm512_mask_bitshuffle_epi64_mask (simde__mmask64 k, simde__m512i b, simde__m512i c) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_mask_bitshuffle_epi64_mask(k, b, c);
#else
return (k & simde_mm512_bitshuffle_epi64_mask(b, c));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_bitshuffle_epi64_mask
#define _mm512_mask_bitshuffle_epi64_mask(k, b, c) simde_mm512_mask_bitshuffle_epi64_mask(k, b, c)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX512_BITSHUFFLE_H) */