stk-code_catmod/lib/simd_wrapper/simde/x86/avx512/shldv.h
2022-04-29 11:02:25 +08:00

158 lines
4.9 KiB
C

#if !defined(SIMDE_X86_AVX512_SHLDV_H)
#define SIMDE_X86_AVX512_SHLDV_H
#include "types.h"
#include "../avx2.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shldv_epi32(simde__m128i a, simde__m128i b, simde__m128i c) {
#if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_shldv_epi32(a, b, c);
#else
simde__m128i_private r_;
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
simde__m128i_private
a_ = simde__m128i_to_private(a),
b_ = simde__m128i_to_private(b),
c_ = simde__m128i_to_private(c);
uint64x2_t
values_lo = vreinterpretq_u64_u32(vzip1q_u32(b_.neon_u32, a_.neon_u32)),
values_hi = vreinterpretq_u64_u32(vzip2q_u32(b_.neon_u32, a_.neon_u32));
int32x4_t count = vandq_s32(c_.neon_i32, vdupq_n_s32(31));
values_lo = vshlq_u64(values_lo, vmovl_s32(vget_low_s32(count)));
values_hi = vshlq_u64(values_hi, vmovl_high_s32(count));
r_.neon_u32 =
vuzp2q_u32(
vreinterpretq_u32_u64(values_lo),
vreinterpretq_u32_u64(values_hi)
);
#elif defined(SIMDE_X86_AVX2_NATIVE)
simde__m256i
tmp1,
lo =
simde_mm256_castps_si256(
simde_mm256_unpacklo_ps(
simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(b)),
simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(a))
)
),
hi =
simde_mm256_castps_si256(
simde_mm256_unpackhi_ps(
simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(b)),
simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(a))
)
),
tmp2 =
simde_mm256_castpd_si256(
simde_mm256_permute2f128_pd(
simde_mm256_castsi256_pd(lo),
simde_mm256_castsi256_pd(hi),
32
)
);
tmp2 =
simde_mm256_sllv_epi64(
tmp2,
simde_mm256_cvtepi32_epi64(
simde_mm_and_si128(
c,
simde_mm_set1_epi32(31)
)
)
);
tmp1 =
simde_mm256_castpd_si256(
simde_mm256_permute2f128_pd(
simde_mm256_castsi256_pd(tmp2),
simde_mm256_castsi256_pd(tmp2),
1
)
);
r_ =
simde__m128i_to_private(
simde_mm256_castsi256_si128(
simde_mm256_castps_si256(
simde_mm256_shuffle_ps(
simde_mm256_castsi256_ps(tmp2),
simde_mm256_castsi256_ps(tmp1),
221
)
)
)
);
#elif defined(SIMDE_X86_SSE2_NATIVE)
simde__m128i_private
c_ = simde__m128i_to_private(c),
lo = simde__m128i_to_private(simde_mm_unpacklo_epi32(b, a)),
hi = simde__m128i_to_private(simde_mm_unpackhi_epi32(b, a));
size_t halfway = (sizeof(r_.u32) / sizeof(r_.u32[0]) / 2);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < halfway ; i++) {
lo.u64[i] <<= (c_.u32[i] & 31);
hi.u64[i] <<= (c_.u32[halfway + i] & 31);
}
r_ =
simde__m128i_to_private(
simde_mm_castps_si128(
simde_mm_shuffle_ps(
simde_mm_castsi128_ps(simde__m128i_from_private(lo)),
simde_mm_castsi128_ps(simde__m128i_from_private(hi)),
221)
)
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
simde__m128i_private
c_ = simde__m128i_to_private(c);
simde__m256i_private
a_ = simde__m256i_to_private(simde_mm256_castsi128_si256(a)),
b_ = simde__m256i_to_private(simde_mm256_castsi128_si256(b)),
tmp1,
tmp2;
tmp1.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp1.u64), SIMDE_SHUFFLE_VECTOR_(32, 32, b_.i32, a_.i32, 0, 8, 1, 9, 2, 10, 3, 11));
SIMDE_CONVERT_VECTOR_(tmp2.u64, c_.u32);
tmp1.u64 <<= (tmp2.u64 & 31);
r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, tmp1.m128i_private[0].i32, tmp1.m128i_private[1].i32, 1, 3, 5, 7);
#else
simde__m128i_private
a_ = simde__m128i_to_private(a),
b_ = simde__m128i_to_private(b),
c_ = simde__m128i_to_private(c);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (((HEDLEY_STATIC_CAST(uint64_t, a_.u32[i]) << 32) | b_.u32[i]) << (c_.u32[i] & 31)) >> 32);
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_shldv_epi32
#define _mm_shldv_epi32(a, b, c) simde_mm_shldv_epi32(a, b, c)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX512_SHLDV_H) */