/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2020 Christopher Moore <moore@free.fr>
*/
#if !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H)
#define SIMDE_X86_AVX512_PERMUTEXVAR_H
#include "types.h"
#include "and.h"
#include "andnot.h"
#include "blend.h"
#include "mov.h"
#include "or.h"
#include "set1.h"
#include "slli.h"
#include "srli.h"
#include "test.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_permutexvar_epi16 (simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_permutexvar_epi16(idx, a);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
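/* Expand each 16-bit index i into the byte pair (2*i, 2*i + 1):
 * (i & 7) * 0x0202 replicates 2*i into both bytes of the lane and
 * adding 0x0100 bumps the high byte, so a single pshufb gathers the
 * whole 16-bit element. */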
simde__m128i mask16 = simde_mm_set1_epi16(0x0007);
simde__m128i shift16 = simde_mm_set1_epi16(0x0202);
simde__m128i byte_index16 = simde_mm_set1_epi16(0x0100);
simde__m128i index16 = simde_mm_and_si128(idx, mask16);
index16 = simde_mm_mullo_epi16(index16, shift16);
index16 = simde_mm_add_epi16(index16, byte_index16);
return simde_mm_shuffle_epi8(a, index16);
#else
simde__m128i_private
idx_ = simde__m128i_to_private(idx),
a_ = simde__m128i_to_private(a),
r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint16x8_t mask16 = vdupq_n_u16(0x0007);
uint16x8_t byte_index16 = vdupq_n_u16(0x0100);
uint16x8_t index16 = vandq_u16(idx_.neon_u16, mask16);
index16 = vmulq_n_u16(index16, 0x0202);
index16 = vaddq_u16(index16, byte_index16);
r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, vreinterpretq_u8_u16(index16));
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16;
index16 = vec_and(idx_.altivec_u16, vec_splat_u16(7));
index16 = vec_mladd(index16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)), vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100)));
r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
const v128_t mask16 = wasm_i16x8_splat(0x0007);
const v128_t shift16 = wasm_i16x8_splat(0x0202);
const v128_t byte_index16 = wasm_i16x8_splat(0x0100);
v128_t index16 = wasm_v128_and(idx_.wasm_v128, mask16);
index16 = wasm_i16x8_mul(index16, shift16);
index16 = wasm_i16x8_add(index16, byte_index16);
r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[idx_.i16[i] & 0x07];
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_permutexvar_epi16
#define _mm_permutexvar_epi16(idx, a) simde_mm_permutexvar_epi16(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_permutexvar_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_permutexvar_epi16(src, k, idx, a);
#else
return simde_mm_mask_mov_epi16(src, k, simde_mm_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_permutexvar_epi16
#define _mm_mask_permutexvar_epi16(src, k, idx, a) simde_mm_mask_permutexvar_epi16(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_permutexvar_epi16 (simde__mmask8 k, simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_permutexvar_epi16(k, idx, a);
#else
return simde_mm_maskz_mov_epi16(k, simde_mm_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_permutexvar_epi16
#define _mm_maskz_permutexvar_epi16(k, idx, a) simde_mm_maskz_permutexvar_epi16(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_permutexvar_epi8 (simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_permutexvar_epi8(idx, a);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
simde__m128i mask = simde_mm_set1_epi8(0x0F);
simde__m128i index = simde_mm_and_si128(idx, mask);
return simde_mm_shuffle_epi8(a, index);
#else
simde__m128i_private
idx_ = simde__m128i_to_private(idx),
a_ = simde__m128i_to_private(a),
r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint8x16_t mask = vdupq_n_u8(0x0F);
uint8x16_t index = vandq_u8(idx_.neon_u8, mask);
r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, index);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, idx_.altivec_u8);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
const v128_t mask = wasm_i8x16_splat(0x0F);
v128_t index = wasm_v128_and(idx_.wasm_v128, mask);
r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[idx_.i8[i] & 0x0F];
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_permutexvar_epi8
#define _mm_permutexvar_epi8(idx, a) simde_mm_permutexvar_epi8(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_permutexvar_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_permutexvar_epi8(src, k, idx, a);
#else
return simde_mm_mask_mov_epi8(src, k, simde_mm_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_permutexvar_epi8
#define _mm_mask_permutexvar_epi8(src, k, idx, a) simde_mm_mask_permutexvar_epi8(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_permutexvar_epi8 (simde__mmask16 k, simde__m128i idx, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_permutexvar_epi8(k, idx, a);
#else
return simde_mm_maskz_mov_epi8(k, simde_mm_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_permutexvar_epi8
#define _mm_maskz_permutexvar_epi8(k, idx, a) simde_mm_maskz_permutexvar_epi8(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi16 (simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_epi16(idx, a);
#elif defined(SIMDE_X86_AVX2_NATIVE)
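/* _mm256_shuffle_epi8 only shuffles within each 128-bit lane, so build
 * lo/hi copies with the low (resp. high) half of 'a' broadcast to both
 * lanes, shuffle each, and blend on bit 3 of the element index (moved
 * into the byte sign bit by the shift below). */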
simde__m256i mask16 = simde_mm256_set1_epi16(0x001F);
simde__m256i shift16 = simde_mm256_set1_epi16(0x0202);
simde__m256i byte_index16 = simde_mm256_set1_epi16(0x0100);
simde__m256i index16 = simde_mm256_and_si256(idx, mask16);
index16 = simde_mm256_mullo_epi16(index16, shift16);
simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
simde__m256i select = simde_mm256_slli_epi64(index16, 3);
index16 = simde_mm256_add_epi16(index16, byte_index16);
lo = simde_mm256_shuffle_epi8(lo, index16);
hi = simde_mm256_shuffle_epi8(hi, index16);
return simde_mm256_blendv_epi8(lo, hi, select);
#else
simde__m256i_private
idx_ = simde__m256i_to_private(idx),
a_ = simde__m256i_to_private(a),
r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint8x16x2_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8 } };
uint16x8_t mask16 = vdupq_n_u16(0x000F);
uint16x8_t byte_index16 = vdupq_n_u16(0x0100);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16);
index16 = vmulq_n_u16(index16, 0x0202);
index16 = vaddq_u16(index16, byte_index16);
r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u16(index16));
}
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16;
mask16 = vec_splat_u16(0x000F);
shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202));
byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16);
index16 = vec_mladd(index16, shift16, byte_index16);
r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8,
a_.m128i_private[1].altivec_u8,
HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16));
}
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
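/* wasm_i8x16_swizzle returns zero for out-of-range byte indices, so each
 * 16-byte chunk of 'a' is looked up separately and the partial results
 * are OR-ed together, subtracting 16 from the indices between steps. */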
v128_t index, index16, r, t;
const v128_t mask16 = wasm_i16x8_splat(0x000F);
const v128_t shift16 = wasm_i16x8_splat(0x0202);
const v128_t byte_index16 = wasm_i16x8_splat(0x0100);
const v128_t sixteen = wasm_i8x16_splat(16);
const v128_t a0 = a_.m128i_private[0].wasm_v128;
const v128_t a1 = a_.m128i_private[1].wasm_v128;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index16 = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask16);
index16 = wasm_i16x8_mul(index16, shift16);
index = wasm_i16x8_add(index16, byte_index16);
r = wasm_i8x16_swizzle(a0, index);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a1, index);
r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
}
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[idx_.i16[i] & 0x0F];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_epi16
#define _mm256_permutexvar_epi16(idx, a) simde_mm256_permutexvar_epi16(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_epi16(src, k, idx, a);
#else
return simde_mm256_mask_mov_epi16(src, k, simde_mm256_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_epi16
#define _mm256_mask_permutexvar_epi16(src, k, idx, a) simde_mm256_mask_permutexvar_epi16(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi16 (simde__mmask16 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_epi16(k, idx, a);
#else
return simde_mm256_maskz_mov_epi16(k, simde_mm256_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_epi16
#define _mm256_maskz_permutexvar_epi16(k, idx, a) simde_mm256_maskz_permutexvar_epi16(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi32 (simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_epi32(idx, a);
#elif defined(SIMDE_X86_AVX2_NATIVE)
return simde_mm256_permutevar8x32_epi32(a, idx);
#else
simde__m256i_private
idx_ = simde__m256i_to_private(idx),
a_ = simde__m256i_to_private(a),
r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
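/* Treat the two 128-bit halves of 'a' as one 32-byte table for
 * vqtbl2q_u8; each 32-bit index is expanded to the byte quadruple
 * (4*i .. 4*i + 3) before the lookup. */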
uint8x16x2_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8 } };
uint32x4_t mask32 = vdupq_n_u32(0x00000007);
uint32x4_t byte_index32 = vdupq_n_u32(0x03020100);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32);
index32 = vmulq_n_u32(index32, 0x04040404);
index32 = vaddq_u32(index32, byte_index32);
r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u32(index32));
}
#else
#if !defined(__INTEL_COMPILER)
SIMDE_VECTORIZE
#endif
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[idx_.i32[i] & 0x07];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_epi32
#define _mm256_permutexvar_epi32(idx, a) simde_mm256_permutexvar_epi32(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_epi32(src, k, idx, a);
#else
return simde_mm256_mask_mov_epi32(src, k, simde_mm256_permutexvar_epi32(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_epi32
#define _mm256_mask_permutexvar_epi32(src, k, idx, a) simde_mm256_mask_permutexvar_epi32(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi32 (simde__mmask8 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_epi32(k, idx, a);
#else
return simde_mm256_maskz_mov_epi32(k, simde_mm256_permutexvar_epi32(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_epi32
#define _mm256_maskz_permutexvar_epi32(k, idx, a) simde_mm256_maskz_permutexvar_epi32(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi64 (simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_epi64(idx, a);
#else
simde__m256i_private
idx_ = simde__m256i_to_private(idx),
a_ = simde__m256i_to_private(a),
r_;
#if !defined(__INTEL_COMPILER)
SIMDE_VECTORIZE
#endif
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[idx_.i64[i] & 3];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_epi64
#define _mm256_permutexvar_epi64(idx, a) simde_mm256_permutexvar_epi64(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_epi64(src, k, idx, a);
#else
return simde_mm256_mask_mov_epi64(src, k, simde_mm256_permutexvar_epi64(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_epi64
#define _mm256_mask_permutexvar_epi64(src, k, idx, a) simde_mm256_mask_permutexvar_epi64(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_epi64(k, idx, a);
#else
return simde_mm256_maskz_mov_epi64(k, simde_mm256_permutexvar_epi64(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_epi64
#define _mm256_maskz_permutexvar_epi64(k, idx, a) simde_mm256_maskz_permutexvar_epi64(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi8 (simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_epi8(idx, a);
#elif defined(SIMDE_X86_AVX2_NATIVE)
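/* Same in-lane pshufb limitation as for 16-bit elements: duplicate the
 * low and high 128-bit halves of 'a' across both lanes, shuffle with the
 * low four index bits, and blend on bit 4 of idx (shifted into the byte
 * sign bit). */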
simde__m256i mask = simde_mm256_set1_epi8(0x0F);
simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
simde__m256i index = simde_mm256_and_si256(idx, mask);
simde__m256i select = simde_mm256_slli_epi64(idx, 3);
lo = simde_mm256_shuffle_epi8(lo, index);
hi = simde_mm256_shuffle_epi8(hi, index);
return simde_mm256_blendv_epi8(lo, hi, select);
#else
simde__m256i_private
idx_ = simde__m256i_to_private(idx),
a_ = simde__m256i_to_private(a),
r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint8x16x2_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8 } };
uint8x16_t mask = vdupq_n_u8(0x1F);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask));
}
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8);
}
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t index, r, t;
const v128_t mask = wasm_i8x16_splat(0x1F);
const v128_t sixteen = wasm_i8x16_splat(16);
const v128_t a0 = a_.m128i_private[0].wasm_v128;
const v128_t a1 = a_.m128i_private[1].wasm_v128;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
r = wasm_i8x16_swizzle(a0, index);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a1, index);
r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
}
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[idx_.i8[i] & 0x1F];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_epi8
#define _mm256_permutexvar_epi8(idx, a) simde_mm256_permutexvar_epi8(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_epi8(src, k, idx, a);
#else
return simde_mm256_mask_mov_epi8(src, k, simde_mm256_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_epi8
#define _mm256_mask_permutexvar_epi8(src, k, idx, a) simde_mm256_mask_permutexvar_epi8(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi8 (simde__mmask32 k, simde__m256i idx, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_epi8(k, idx, a);
#else
return simde_mm256_maskz_mov_epi8(k, simde_mm256_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_epi8
#define _mm256_maskz_permutexvar_epi8(k, idx, a) simde_mm256_maskz_permutexvar_epi8(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_permutexvar_pd (simde__m256i idx, simde__m256d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_pd(idx, a);
#else
return simde_mm256_castsi256_pd(simde_mm256_permutexvar_epi64(idx, simde_mm256_castpd_si256(a)));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_pd
#define _mm256_permutexvar_pd(idx, a) simde_mm256_permutexvar_pd(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_mask_permutexvar_pd (simde__m256d src, simde__mmask8 k, simde__m256i idx, simde__m256d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_pd(src, k, idx, a);
#else
return simde_mm256_mask_mov_pd(src, k, simde_mm256_permutexvar_pd(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_pd
#define _mm256_mask_permutexvar_pd(src, k, idx, a) simde_mm256_mask_permutexvar_pd(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_maskz_permutexvar_pd (simde__mmask8 k, simde__m256i idx, simde__m256d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_pd(k, idx, a);
#else
return simde_mm256_maskz_mov_pd(k, simde_mm256_permutexvar_pd(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_pd
#define _mm256_maskz_permutexvar_pd(k, idx, a) simde_mm256_maskz_permutexvar_pd(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_permutexvar_ps (simde__m256i idx, simde__m256 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_permutexvar_ps(idx, a);
#elif defined(SIMDE_X86_AVX2_NATIVE)
return simde_mm256_permutevar8x32_ps(a, idx);
#else
return simde_mm256_castsi256_ps(simde_mm256_permutexvar_epi32(idx, simde_mm256_castps_si256(a)));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_permutexvar_ps
#define _mm256_permutexvar_ps(idx, a) simde_mm256_permutexvar_ps(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_mask_permutexvar_ps (simde__m256 src, simde__mmask8 k, simde__m256i idx, simde__m256 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_permutexvar_ps(src, k, idx, a);
#else
return simde_mm256_mask_mov_ps(src, k, simde_mm256_permutexvar_ps(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_permutexvar_ps
#define _mm256_mask_permutexvar_ps(src, k, idx, a) simde_mm256_mask_permutexvar_ps(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_maskz_permutexvar_ps (simde__mmask8 k, simde__m256i idx, simde__m256 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_permutexvar_ps(k, idx, a);
#else
return simde_mm256_maskz_mov_ps(k, simde_mm256_permutexvar_ps(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_permutexvar_ps
#define _mm256_maskz_permutexvar_ps(k, idx, a) simde_mm256_maskz_permutexvar_ps(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi16 (simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE)
return _mm512_permutexvar_epi16(idx, a);
#else
simde__m512i_private
idx_ = simde__m512i_to_private(idx),
a_ = simde__m512i_to_private(a),
r_;
#if defined(SIMDE_X86_AVX2_NATIVE)
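/* Broadcast each 128-bit quarter of 'a' to both lanes of a 256-bit
 * register, shuffle all four, then blend in two stages: bit 3 of the
 * element index picks within a 256-bit pair and bit 4 picks between
 * the pairs. */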
simde__m256i t0, t1, index, select, a01, a23;
simde__m256i mask = simde_mm256_set1_epi16(0x001F);
simde__m256i shift = simde_mm256_set1_epi16(0x0202);
simde__m256i byte_index = simde_mm256_set1_epi16(0x0100);
simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]);
simde__m256i a1 = simde_mm256_broadcastsi128_si256(a_.m128i[1]);
simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]);
simde__m256i a3 = simde_mm256_broadcastsi128_si256(a_.m128i[3]);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
index = idx_.m256i[i];
index = simde_mm256_and_si256(index, mask);
index = simde_mm256_mullo_epi16(index, shift);
index = simde_mm256_add_epi16(index, byte_index);
t0 = simde_mm256_shuffle_epi8(a0, index);
t1 = simde_mm256_shuffle_epi8(a1, index);
select = simde_mm256_slli_epi64(index, 3);
a01 = simde_mm256_blendv_epi8(t0, t1, select);
t0 = simde_mm256_shuffle_epi8(a2, index);
t1 = simde_mm256_shuffle_epi8(a3, index);
a23 = simde_mm256_blendv_epi8(t0, t1, select);
select = simde_mm256_slli_epi64(index, 2);
r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select);
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
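/* All 64 bytes of 'a' form a four-register table for vqtbl4q_u8; the
 * 16-bit indices are expanded to byte pairs exactly as in the 128-bit
 * case. */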
uint8x16x4_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8,
a_.m128i_private[2].neon_u8,
a_.m128i_private[3].neon_u8 } };
uint16x8_t mask16 = vdupq_n_u16(0x001F);
uint16x8_t byte_index16 = vdupq_n_u16(0x0100);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16);
index16 = vmulq_n_u16(index16, 0x0202);
index16 = vaddq_u16(index16, byte_index16);
r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u16(index16));
}
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
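/* vec_perm indexes a 32-byte table, so permute the low and high 256-bit
 * halves of 'a' separately and select between the two results with bit 5
 * of the byte index (set for elements 16..31). */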
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16;
SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23;
mask16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x001F));
shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202));
byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100));
test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16);
index16 = vec_mladd(index16, shift16, byte_index16);
index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16);
r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index);
r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index);
r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test));
}
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t index, r, t;
const v128_t mask = wasm_i16x8_splat(0x001F);
const v128_t shift = wasm_i16x8_splat(0x0202);
const v128_t byte_index = wasm_i16x8_splat(0x0100);
const v128_t sixteen = wasm_i8x16_splat(16);
const v128_t a0 = a_.m128i_private[0].wasm_v128;
const v128_t a1 = a_.m128i_private[1].wasm_v128;
const v128_t a2 = a_.m128i_private[2].wasm_v128;
const v128_t a3 = a_.m128i_private[3].wasm_v128;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
index = wasm_i16x8_mul(index, shift);
index = wasm_i16x8_add(index, byte_index);
r = wasm_i8x16_swizzle(a0, index);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a1, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a2, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a3, index);
r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
}
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[idx_.i16[i] & 0x1F];
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_epi16
#define _mm512_permutexvar_epi16(idx, a) simde_mm512_permutexvar_epi16(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE)
return _mm512_mask_permutexvar_epi16(src, k, idx, a);
#else
return simde_mm512_mask_mov_epi16(src, k, simde_mm512_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_epi16
#define _mm512_mask_permutexvar_epi16(src, k, idx, a) simde_mm512_mask_permutexvar_epi16(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi16 (simde__mmask32 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BW_NATIVE)
return _mm512_maskz_permutexvar_epi16(k, idx, a);
#else
return simde_mm512_maskz_mov_epi16(k, simde_mm512_permutexvar_epi16(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_epi16
#define _mm512_maskz_permutexvar_epi16(k, idx, a) simde_mm512_maskz_permutexvar_epi16(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi32 (simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutexvar_epi32(idx, a);
#else
simde__m512i_private
idx_ = simde__m512i_to_private(idx),
a_ = simde__m512i_to_private(a),
r_;
#if defined(SIMDE_X86_AVX2_NATIVE)
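/* vpermd handles one 256-bit half of 'a' at a time; bit 3 of the index
 * (moved into the sign bit of each 32-bit element by the shift of 28)
 * then selects between the two half results via blendv_ps. */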
simde__m256i index, r0, r1, select;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
index = idx_.m256i[i];
r0 = simde_mm256_permutevar8x32_epi32(a_.m256i[0], index);
r1 = simde_mm256_permutevar8x32_epi32(a_.m256i[1], index);
select = simde_mm256_slli_epi32(index, 28);
r_.m256i[i] = simde_mm256_castps_si256(simde_mm256_blendv_ps(simde_mm256_castsi256_ps(r0),
simde_mm256_castsi256_ps(r1),
simde_mm256_castsi256_ps(select)));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint8x16x4_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8,
a_.m128i_private[2].neon_u8,
a_.m128i_private[3].neon_u8 } };
uint32x4_t mask32 = vdupq_n_u32(0x0000000F);
uint32x4_t byte_index32 = vdupq_n_u32(0x03020100);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32);
index32 = vmulq_n_u32(index32, 0x04040404);
index32 = vaddq_u32(index32, byte_index32);
r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u32(index32));
}
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) index32, mask32, byte_index32, temp32, sixteen;
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) zero, shift;
SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23;
mask32 = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x0000000F));
byte_index32 = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x03020100));
zero = vec_splat_u16(0);
shift = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0404));
sixteen = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 16));
test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index32 = vec_and(idx_.m128i_private[i].altivec_u32, mask32);
/* Multiply index32 by 0x04040404; unfortunately vec_mul isn't available so (mis)use 16-bit vec_mladd */
temp32 = vec_sl(index32, sixteen);
index32 = vec_add(index32, temp32);
index32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
vec_mladd(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), index32),
shift,
zero));
index32 = vec_add(index32, byte_index32);
index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index32);
r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index);
r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index);
r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test));
}
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t index, r, t;
const v128_t mask = wasm_i32x4_splat(0x0000000F);
const v128_t shift = wasm_i32x4_splat(0x04040404);
const v128_t byte_index = wasm_i32x4_splat(0x03020100);
const v128_t sixteen = wasm_i8x16_splat(16);
const v128_t a0 = a_.m128i_private[0].wasm_v128;
const v128_t a1 = a_.m128i_private[1].wasm_v128;
const v128_t a2 = a_.m128i_private[2].wasm_v128;
const v128_t a3 = a_.m128i_private[3].wasm_v128;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
index = wasm_i32x4_mul(index, shift);
index = wasm_i32x4_add(index, byte_index);
r = wasm_i8x16_swizzle(a0, index);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a1, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a2, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a3, index);
r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
}
#else
#if !defined(__INTEL_COMPILER)
SIMDE_VECTORIZE
#endif
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[idx_.i32[i] & 0x0F];
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_epi32
#define _mm512_permutexvar_epi32(idx, a) simde_mm512_permutexvar_epi32(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutexvar_epi32(src, k, idx, a);
#else
return simde_mm512_mask_mov_epi32(src, k, simde_mm512_permutexvar_epi32(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_epi32
#define _mm512_mask_permutexvar_epi32(src, k, idx, a) simde_mm512_mask_permutexvar_epi32(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi32 (simde__mmask16 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutexvar_epi32(k, idx, a);
#else
return simde_mm512_maskz_mov_epi32(k, simde_mm512_permutexvar_epi32(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_epi32
#define _mm512_maskz_permutexvar_epi32(k, idx, a) simde_mm512_maskz_permutexvar_epi32(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi64 (simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutexvar_epi64(idx, a);
#else
simde__m512i_private
idx_ = simde__m512i_to_private(idx),
a_ = simde__m512i_to_private(a),
r_;
#if !defined(__INTEL_COMPILER)
SIMDE_VECTORIZE
#endif
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[idx_.i64[i] & 7];
}
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_epi64
#define _mm512_permutexvar_epi64(idx, a) simde_mm512_permutexvar_epi64(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutexvar_epi64(src, k, idx, a);
#else
return simde_mm512_mask_mov_epi64(src, k, simde_mm512_permutexvar_epi64(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_epi64
#define _mm512_mask_permutexvar_epi64(src, k, idx, a) simde_mm512_mask_permutexvar_epi64(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutexvar_epi64(k, idx, a);
#else
return simde_mm512_maskz_mov_epi64(k, simde_mm512_permutexvar_epi64(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_epi64
#define _mm512_maskz_permutexvar_epi64(k, idx, a) simde_mm512_maskz_permutexvar_epi64(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi8 (simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE)
return _mm512_permutexvar_epi8(idx, a);
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
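/* Emulate a byte permute with two 16-bit permutes: shifting idx right by
 * 1 (and by 9) turns the byte indices in the low (and high) byte of each
 * 16-bit lane into word indices; bit 0 of each original index, extracted
 * with test_epi8_mask, then selects the low or high byte of the word
 * fetched for that position. */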
simde__m512i hilo, hi, lo, hi2, lo2, idx2;
simde__m512i ones = simde_mm512_set1_epi8(1);
simde__m512i low_bytes = simde_mm512_set1_epi16(0x00FF);
idx2 = simde_mm512_srli_epi16(idx, 1);
hilo = simde_mm512_permutexvar_epi16(idx2, a);
simde__mmask64 mask = simde_mm512_test_epi8_mask(idx, ones);
lo = simde_mm512_and_si512(hilo, low_bytes);
hi = simde_mm512_srli_epi16(hilo, 8);
idx2 = simde_mm512_srli_epi16(idx, 9);
hilo = simde_mm512_permutexvar_epi16(idx2, a);
lo2 = simde_mm512_slli_epi16(hilo, 8);
hi2 = simde_mm512_andnot_si512(low_bytes, hilo);
lo = simde_mm512_or_si512(lo, lo2);
hi = simde_mm512_or_si512(hi, hi2);
return simde_mm512_mask_blend_epi8(mask, lo, hi);
#else
simde__m512i_private
idx_ = simde__m512i_to_private(idx),
a_ = simde__m512i_to_private(a),
r_;
#if defined(SIMDE_X86_AVX2_NATIVE)
simde__m256i t0, t1, index, select, a01, a23;
simde__m256i mask = simde_mm256_set1_epi8(0x3F);
simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]);
simde__m256i a1 = simde_mm256_broadcastsi128_si256(a_.m128i[1]);
simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]);
simde__m256i a3 = simde_mm256_broadcastsi128_si256(a_.m128i[3]);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
index = idx_.m256i[i];
index = simde_mm256_and_si256(index, mask);
select = simde_mm256_slli_epi64(index, 3);
t0 = simde_mm256_shuffle_epi8(a0, index);
t1 = simde_mm256_shuffle_epi8(a1, index);
a01 = simde_mm256_blendv_epi8(t0, t1, select);
t0 = simde_mm256_shuffle_epi8(a2, index);
t1 = simde_mm256_shuffle_epi8(a3, index);
a23 = simde_mm256_blendv_epi8(t0, t1, select);
select = simde_mm256_slli_epi64(index, 2);
r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select);
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
uint8x16x4_t table = { { a_.m128i_private[0].neon_u8,
a_.m128i_private[1].neon_u8,
a_.m128i_private[2].neon_u8,
a_.m128i_private[3].neon_u8 } };
uint8x16_t mask = vdupq_n_u8(0x3F);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask));
}
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) test, r01, r23;
test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8);
r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, idx_.m128i_private[i].altivec_u8);
r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(idx_.m128i_private[i].altivec_u8, test), test));
}
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
v128_t index, r, t;
const v128_t mask = wasm_i8x16_splat(0x3F);
const v128_t sixteen = wasm_i8x16_splat(16);
const v128_t a0 = a_.m128i_private[0].wasm_v128;
const v128_t a1 = a_.m128i_private[1].wasm_v128;
const v128_t a2 = a_.m128i_private[2].wasm_v128;
const v128_t a3 = a_.m128i_private[3].wasm_v128;
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
r = wasm_i8x16_swizzle(a0, index);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a1, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a2, index);
r = wasm_v128_or(r, t);
index = wasm_i8x16_sub(index, sixteen);
t = wasm_i8x16_swizzle(a3, index);
r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
}
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[idx_.i8[i] & 0x3F];
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_epi8
#define _mm512_permutexvar_epi8(idx, a) simde_mm512_permutexvar_epi8(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE)
return _mm512_mask_permutexvar_epi8(src, k, idx, a);
#else
return simde_mm512_mask_mov_epi8(src, k, simde_mm512_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_epi8
#define _mm512_mask_permutexvar_epi8(src, k, idx, a) simde_mm512_mask_permutexvar_epi8(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi8 (simde__mmask64 k, simde__m512i idx, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VBMI_NATIVE)
return _mm512_maskz_permutexvar_epi8(k, idx, a);
#else
return simde_mm512_maskz_mov_epi8(k, simde_mm512_permutexvar_epi8(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_epi8
#define _mm512_maskz_permutexvar_epi8(k, idx, a) simde_mm512_maskz_permutexvar_epi8(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_permutexvar_pd (simde__m512i idx, simde__m512d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutexvar_pd(idx, a);
#else
return simde_mm512_castsi512_pd(simde_mm512_permutexvar_epi64(idx, simde_mm512_castpd_si512(a)));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_pd
#define _mm512_permutexvar_pd(idx, a) simde_mm512_permutexvar_pd(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_mask_permutexvar_pd (simde__m512d src, simde__mmask8 k, simde__m512i idx, simde__m512d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutexvar_pd(src, k, idx, a);
#else
return simde_mm512_mask_mov_pd(src, k, simde_mm512_permutexvar_pd(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_pd
#define _mm512_mask_permutexvar_pd(src, k, idx, a) simde_mm512_mask_permutexvar_pd(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_maskz_permutexvar_pd (simde__mmask8 k, simde__m512i idx, simde__m512d a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutexvar_pd(k, idx, a);
#else
return simde_mm512_maskz_mov_pd(k, simde_mm512_permutexvar_pd(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_pd
#define _mm512_maskz_permutexvar_pd(k, idx, a) simde_mm512_maskz_permutexvar_pd(k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_permutexvar_ps (simde__m512i idx, simde__m512 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutexvar_ps(idx, a);
#else
return simde_mm512_castsi512_ps(simde_mm512_permutexvar_epi32(idx, simde_mm512_castps_si512(a)));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutexvar_ps
#define _mm512_permutexvar_ps(idx, a) simde_mm512_permutexvar_ps(idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_mask_permutexvar_ps (simde__m512 src, simde__mmask16 k, simde__m512i idx, simde__m512 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutexvar_ps(src, k, idx, a);
#else
return simde_mm512_mask_mov_ps(src, k, simde_mm512_permutexvar_ps(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutexvar_ps
#define _mm512_mask_permutexvar_ps(src, k, idx, a) simde_mm512_mask_permutexvar_ps(src, k, idx, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_maskz_permutexvar_ps (simde__mmask16 k, simde__m512i idx, simde__m512 a) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutexvar_ps(k, idx, a);
#else
return simde_mm512_maskz_mov_ps(k, simde_mm512_permutexvar_ps(idx, a));
#endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutexvar_ps
#define _mm512_maskz_permutexvar_ps(k, idx, a) simde_mm512_maskz_permutexvar_ps(k, idx, a)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H) */