/* lib/simd_wrapper/simde/x86/avx512/popcnt.h */
#if !defined(SIMDE_X86_AVX512_POPCNT_H)
#define SIMDE_X86_AVX512_POPCNT_H
#include "types.h"
#include "mov.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_popcnt_epi8 (simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_popcnt_epi8(a);
#else
simde__m128i_private
r_,
a_ = simde__m128i_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_i8 = vcntq_s8(a_.neon_i8);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = wasm_i8x16_popcnt(a_.wasm_v128);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
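/* Split each byte into its low and high nibble, look up the per-nibble
 * bit count in the 16-entry table via _mm_shuffle_epi8, and add the two
 * partial counts to get the popcount of each byte. */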
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
r_.n =
_mm_add_epi8(
_mm_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm_shuffle_epi8(
lut,
_mm_srli_epi16(
high_nibble_of_input,
4
)
)
);
#elif defined(SIMDE_X86_SSE2_NATIVE)
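/* Classic bit-parallel ("SWAR") popcount applied to all 16 bytes at once.
 * SSE2 has no 8-bit shift, so _mm_srli_epi16 is used instead; the per-byte
 * masks applied afterwards discard any bits shifted in from the
 * neighbouring byte. */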
/* v -= ((v >> 1) & UINT8_C(0x55)); */
r_.n =
_mm_sub_epi8(
a_.n,
_mm_and_si128(
_mm_srli_epi16(a_.n, 1),
_mm_set1_epi8(0x55)
)
);
/* v = (v & 0x33) + ((v >> 2) & 0x33); */
r_.n =
_mm_add_epi8(
_mm_and_si128(
r_.n,
_mm_set1_epi8(0x33)
),
_mm_and_si128(
_mm_srli_epi16(r_.n, 2),
_mm_set1_epi8(0x33)
)
);
/* v = (v + (v >> 4)) & 0xf */
r_.n =
_mm_and_si128(
_mm_add_epi8(
r_.n,
_mm_srli_epi16(r_.n, 4)
),
_mm_set1_epi8(0x0f)
);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), a_.altivec_i8)));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u8 -= ((a_.u8 >> 1) & 0x55);
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
v -= ((v >> 1) & 0x55);
v = (v & 0x33) + ((v >> 2) & 0x33);
v = (v + (v >> 4)) & 0xf;
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_popcnt_epi8
#define _mm_popcnt_epi8(a) simde_mm_popcnt_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_popcnt_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_popcnt_epi8(src, k, a);
#else
return simde_mm_mask_mov_epi8(src, k, simde_mm_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_popcnt_epi8
#define _mm_mask_popcnt_epi8(src, k, a) simde_mm_mask_popcnt_epi8(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_popcnt_epi8 (simde__mmask16 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_popcnt_epi8(k, a);
#else
return simde_mm_maskz_mov_epi8(k, simde_mm_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_popcnt_epi8
#define _mm_maskz_popcnt_epi8(k, a) simde_mm_maskz_popcnt_epi8(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_popcnt_epi16 (simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_popcnt_epi16(a);
#else
simde__m128i_private
r_,
a_ = simde__m128i_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
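/* vcntq_s8 gives the popcount of every byte; vpaddlq_s8 then adds each
 * pair of adjacent byte counts into a widened 16-bit lane. */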
r_.neon_i16 = vpaddlq_s8(vcntq_s8(a_.neon_i8));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = wasm_i16x8_extadd_pairwise_i8x16(wasm_i8x16_popcnt(a_.wasm_v128));
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
r_.altivec_u16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), a_.altivec_u16)));
#elif defined(SIMDE_X86_XOP_NATIVE)
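/* Same nibble-LUT trick as the SSSE3 path, followed by XOP's
 * _mm_haddw_epi8, which horizontally adds adjacent signed bytes into
 * 16-bit lanes. */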
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
r_.n =
_mm_haddw_epi8(
_mm_add_epi8(
_mm_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm_shuffle_epi8(
lut,
_mm_srli_epi16(high_nibble_of_input, 4)
)
)
);
#elif defined(SIMDE_X86_SSE2_NATIVE)
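/* Bit-parallel popcount on 16-bit lanes; the final multiply by 0x0101
 * followed by a shift of 8 adds the two per-byte counts and leaves the
 * total in the low byte of each lane. */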
r_.n =
_mm_sub_epi16(
a_.n,
_mm_and_si128(
_mm_srli_epi16(a_.n, 1),
_mm_set1_epi16(0x5555)
)
);
r_.n =
_mm_add_epi16(
_mm_and_si128(
r_.n,
_mm_set1_epi16(0x3333)
),
_mm_and_si128(
_mm_srli_epi16(r_.n, 2),
_mm_set1_epi16(0x3333)
)
);
r_.n =
_mm_and_si128(
_mm_add_epi16(
r_.n,
_mm_srli_epi16(r_.n, 4)
),
_mm_set1_epi16(0x0f0f)
);
r_.n =
_mm_srli_epi16(
_mm_mullo_epi16(
r_.n,
_mm_set1_epi16(0x0101)
),
(sizeof(uint16_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
v -= ((v >> 1) & UINT16_C(0x5555));
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_popcnt_epi16
#define _mm_popcnt_epi16(a) simde_mm_popcnt_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_popcnt_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_popcnt_epi16(src, k, a);
#else
return simde_mm_mask_mov_epi16(src, k, simde_mm_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_popcnt_epi16
#define _mm_mask_popcnt_epi16(src, k, a) simde_mm_mask_popcnt_epi16(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_popcnt_epi16 (simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_popcnt_epi16(k, a);
#else
return simde_mm_maskz_mov_epi16(k, simde_mm_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_popcnt_epi16
#define _mm_maskz_popcnt_epi16(k, a) simde_mm_maskz_popcnt_epi16(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_popcnt_epi32 (simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_popcnt_epi32(a);
#else
simde__m128i_private
r_,
a_ = simde__m128i_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
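/* Per-byte popcount, then two pairwise widening adds:
 * bytes -> 16-bit lanes -> 32-bit lanes. */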
r_.neon_i32 = vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8)));
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
r_.altivec_u32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), a_.altivec_u32)));
#elif defined(SIMDE_X86_XOP_NATIVE)
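/* Nibble-LUT byte counts combined with XOP's _mm_haddd_epi8, which
 * horizontally adds the four signed bytes of each 32-bit lane. */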
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
r_.n =
_mm_haddd_epi8(
_mm_add_epi8(
_mm_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm_shuffle_epi8(
lut,
_mm_srli_epi16(high_nibble_of_input, 4)
)
)
);
#elif defined(SIMDE_X86_SSE4_1_NATIVE)
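/* 32-bit SWAR popcount.  The final multiply by 0x01010101 accumulates all
 * four byte counts into the top byte before the shift; _mm_mullo_epi32 is
 * why this path requires SSE4.1 rather than plain SSE2. */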
r_.n =
_mm_sub_epi32(
a_.n,
_mm_and_si128(
_mm_srli_epi32(a_.n, 1),
_mm_set1_epi32(0x55555555)
)
);
r_.n =
_mm_add_epi32(
_mm_and_si128(
r_.n,
_mm_set1_epi32(0x33333333)
),
_mm_and_si128(
_mm_srli_epi32(r_.n, 2),
_mm_set1_epi32(0x33333333)
)
);
r_.n =
_mm_and_si128(
_mm_add_epi32(
r_.n,
_mm_srli_epi32(r_.n, 4)
),
_mm_set1_epi32(0x0f0f0f0f)
);
r_.n =
_mm_srli_epi32(
_mm_mullo_epi32(
r_.n,
_mm_set1_epi32(0x01010101)
),
(sizeof(uint32_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
v -= ((v >> 1) & UINT32_C(0x55555555));
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_popcnt_epi32
#define _mm_popcnt_epi32(a) simde_mm_popcnt_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_popcnt_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_popcnt_epi32(src, k, a);
#else
return simde_mm_mask_mov_epi32(src, k, simde_mm_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_popcnt_epi32
#define _mm_mask_popcnt_epi32(src, k, a) simde_mm_mask_popcnt_epi32(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_popcnt_epi32 (simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_popcnt_epi32(k, a);
#else
return simde_mm_maskz_mov_epi32(k, simde_mm_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_popcnt_epi32
#define _mm_maskz_popcnt_epi32(k, a) simde_mm_maskz_popcnt_epi32(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_popcnt_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_popcnt_epi64(a);
#else
simde__m128i_private
r_,
a_ = simde__m128i_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
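/* Per-byte popcount followed by three pairwise widening adds up to
 * 64-bit lanes. */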
r_.neon_i64 = vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8))));
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), a_.altivec_u64)));
#elif defined(SIMDE_X86_SSSE3_NATIVE)
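/* Nibble-LUT byte counts, then _mm_sad_epu8 against zero, which sums the
 * eight byte counts of each 64-bit half into that half's low 16 bits. */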
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
r_.n =
_mm_sad_epu8(
_mm_add_epi8(
_mm_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm_shuffle_epi8(
lut,
_mm_srli_epi16(high_nibble_of_input, 4)
)
),
_mm_setzero_si128()
);
#elif defined(SIMDE_X86_SSE2_NATIVE)
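/* Per-byte SWAR popcount (as in the epi8 version), then _mm_sad_epu8
 * against zero to accumulate the byte counts into each 64-bit lane. */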
r_.n =
_mm_sub_epi8(
a_.n,
_mm_and_si128(
_mm_srli_epi16(a_.n, 1),
_mm_set1_epi8(0x55)
)
);
r_.n =
_mm_add_epi8(
_mm_and_si128(
r_.n,
_mm_set1_epi8(0x33)
),
_mm_and_si128(
_mm_srli_epi16(r_.n, 2),
_mm_set1_epi8(0x33)
)
);
r_.n =
_mm_and_si128(
_mm_add_epi8(
r_.n,
_mm_srli_epi16(r_.n, 4)
),
_mm_set1_epi8(0x0f)
);
r_.n =
_mm_sad_epu8(
r_.n,
_mm_setzero_si128()
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
}
#endif
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_popcnt_epi64
#define _mm_popcnt_epi64(a) simde_mm_popcnt_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_popcnt_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_mask_popcnt_epi64(src, k, a);
#else
return simde_mm_mask_mov_epi64(src, k, simde_mm_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_popcnt_epi64
#define _mm_mask_popcnt_epi64(src, k, a) simde_mm_mask_popcnt_epi64(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_popcnt_epi64 (simde__mmask8 k, simde__m128i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm_maskz_popcnt_epi64(k, a);
#else
return simde_mm_maskz_mov_epi64(k, simde_mm_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm_maskz_popcnt_epi64
#define _mm_maskz_popcnt_epi64(k, a) simde_mm_maskz_popcnt_epi64(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_popcnt_epi8 (simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_popcnt_epi8(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]);
}
#elif defined(SIMDE_X86_AVX2_NATIVE)
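/* 256-bit version of the nibble-LUT approach.  _mm256_shuffle_epi8 looks
 * up within each 128-bit lane, which is fine here because both halves of
 * the LUT hold the same 16 entries. */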
const __m256i low_nibble_set = _mm256_set1_epi8(0x0f);
const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n);
const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n);
const __m256i lut =
_mm256_set_epi8(
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
);
r_.n =
_mm256_add_epi8(
_mm256_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm256_shuffle_epi8(
lut,
_mm256_srli_epi16(
high_nibble_of_input,
4
)
)
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u8 -= ((a_.u8 >> 1) & 0x55);
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
v -= ((v >> 1) & 0x55);
v = (v & 0x33) + ((v >> 2) & 0x33);
v = (v + (v >> 4)) & 0xf;
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_popcnt_epi8
#define _mm256_popcnt_epi8(a) simde_mm256_popcnt_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_popcnt_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_popcnt_epi8(src, k, a);
#else
return simde_mm256_mask_mov_epi8(src, k, simde_mm256_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_popcnt_epi8
#define _mm256_mask_popcnt_epi8(src, k, a) simde_mm256_mask_popcnt_epi8(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_popcnt_epi8 (simde__mmask32 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_popcnt_epi8(k, a);
#else
return simde_mm256_maskz_mov_epi8(k, simde_mm256_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_popcnt_epi8
#define _mm256_maskz_popcnt_epi8(k, a) simde_mm256_maskz_popcnt_epi8(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_popcnt_epi16 (simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_popcnt_epi16(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]);
}
#elif defined(SIMDE_X86_AVX2_NATIVE)
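/* Same 16-bit SWAR sequence as the 128-bit SSE2 path, applied to a
 * 256-bit vector with AVX2. */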
r_.n =
_mm256_sub_epi16(
a_.n,
_mm256_and_si256(
_mm256_srli_epi16(a_.n, 1),
_mm256_set1_epi16(0x5555)
)
);
r_.n =
_mm256_add_epi16(
_mm256_and_si256(
r_.n,
_mm256_set1_epi16(0x3333)
),
_mm256_and_si256(
_mm256_srli_epi16(r_.n, 2),
_mm256_set1_epi16(0x3333)
)
);
r_.n =
_mm256_and_si256(
_mm256_add_epi16(
r_.n,
_mm256_srli_epi16(r_.n, 4)
),
_mm256_set1_epi16(0x0f0f)
);
r_.n =
_mm256_srli_epi16(
_mm256_mullo_epi16(
r_.n,
_mm256_set1_epi16(0x0101)
),
(sizeof(uint16_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
v -= ((v >> 1) & UINT16_C(0x5555));
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_popcnt_epi16
#define _mm256_popcnt_epi16(a) simde_mm256_popcnt_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_popcnt_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_popcnt_epi16(src, k, a);
#else
return simde_mm256_mask_mov_epi16(src, k, simde_mm256_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_popcnt_epi16
#define _mm256_mask_popcnt_epi16(src, k, a) simde_mm256_mask_popcnt_epi16(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_popcnt_epi16 (simde__mmask16 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_popcnt_epi16(k, a);
#else
return simde_mm256_maskz_mov_epi16(k, simde_mm256_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_popcnt_epi16
#define _mm256_maskz_popcnt_epi16(k, a) simde_mm256_maskz_popcnt_epi16(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_popcnt_epi32 (simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_popcnt_epi32(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]);
}
#elif defined(SIMDE_X86_AVX2_NATIVE)
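/* 256-bit version of the 32-bit SWAR popcount used in the SSE4.1 path. */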
r_.n =
_mm256_sub_epi32(
a_.n,
_mm256_and_si256(
_mm256_srli_epi32(a_.n, 1),
_mm256_set1_epi32(0x55555555)
)
);
r_.n =
_mm256_add_epi32(
_mm256_and_si256(
r_.n,
_mm256_set1_epi32(0x33333333)
),
_mm256_and_si256(
_mm256_srli_epi32(r_.n, 2),
_mm256_set1_epi32(0x33333333)
)
);
r_.n =
_mm256_and_si256(
_mm256_add_epi32(
r_.n,
_mm256_srli_epi32(r_.n, 4)
),
_mm256_set1_epi32(0x0f0f0f0f)
);
r_.n =
_mm256_srli_epi32(
_mm256_mullo_epi32(
r_.n,
_mm256_set1_epi32(0x01010101)
),
(sizeof(uint32_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
v -= ((v >> 1) & UINT32_C(0x55555555));
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_popcnt_epi32
#define _mm256_popcnt_epi32(a) simde_mm256_popcnt_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_popcnt_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_popcnt_epi32(src, k, a);
#else
return simde_mm256_mask_mov_epi32(src, k, simde_mm256_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_popcnt_epi32
#define _mm256_mask_popcnt_epi32(src, k, a) simde_mm256_mask_popcnt_epi32(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_popcnt_epi32 (simde__mmask8 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_popcnt_epi32(k, a);
#else
return simde_mm256_maskz_mov_epi32(k, simde_mm256_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_popcnt_epi32
#define _mm256_maskz_popcnt_epi32(k, a) simde_mm256_maskz_popcnt_epi32(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_popcnt_epi64 (simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_popcnt_epi64(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < sizeof(r_.m128i) / sizeof(r_.m128i[0]) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]);
}
#elif defined(SIMDE_X86_AVX2_NATIVE)
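/* Nibble-LUT byte counts, accumulated per 64-bit lane with
 * _mm256_sad_epu8 against zero. */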
const __m256i low_nibble_set = _mm256_set1_epi8(0x0f);
const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n);
const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n);
const __m256i lut =
_mm256_set_epi8(
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
);
r_.n =
_mm256_sad_epu8(
_mm256_add_epi8(
_mm256_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm256_shuffle_epi8(
lut,
_mm256_srli_epi16(high_nibble_of_input, 4)
)
),
_mm256_setzero_si256()
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_popcnt_epi64
#define _mm256_popcnt_epi64(a) simde_mm256_popcnt_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_popcnt_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_mask_popcnt_epi64(src, k, a);
#else
return simde_mm256_mask_mov_epi64(src, k, simde_mm256_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_popcnt_epi64
#define _mm256_mask_popcnt_epi64(src, k, a) simde_mm256_mask_popcnt_epi64(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_popcnt_epi64 (simde__mmask8 k, simde__m256i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
return _mm256_maskz_popcnt_epi64(k, a);
#else
return simde_mm256_maskz_mov_epi64(k, simde_mm256_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm256_maskz_popcnt_epi64
#define _mm256_maskz_popcnt_epi64(k, a) simde_mm256_maskz_popcnt_epi64(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_popcnt_epi8 (simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_popcnt_epi8(a);
#else
simde__m512i_private
r_,
a_ = simde__m512i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]);
}
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_mm256_popcnt_epi8(a_.m256i[i]);
}
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
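/* 512-bit nibble-LUT popcount; _mm512_shuffle_epi8 (AVX-512BW) performs
 * the per-nibble table lookup within each 128-bit lane, so the LUT is
 * repeated in every lane. */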
const __m512i low_nibble_set = _mm512_set1_epi8(0x0f);
const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n);
const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n);
const __m512i lut =
simde_mm512_set_epi8(
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
);
r_.n =
_mm512_add_epi8(
_mm512_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm512_shuffle_epi8(
lut,
_mm512_srli_epi16(
high_nibble_of_input,
4
)
)
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u8 -= ((a_.u8 >> 1) & 0x55);
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
v -= ((v >> 1) & 0x55);
v = (v & 0x33) + ((v >> 2) & 0x33);
v = (v + (v >> 4)) & 0xf;
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_popcnt_epi8
#define _mm512_popcnt_epi8(a) simde_mm512_popcnt_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_popcnt_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_mask_popcnt_epi8(src, k, a);
#else
return simde_mm512_mask_mov_epi8(src, k, simde_mm512_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_popcnt_epi8
#define _mm512_mask_popcnt_epi8(src, k, a) simde_mm512_mask_popcnt_epi8(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_popcnt_epi8 (simde__mmask64 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_maskz_popcnt_epi8(k, a);
#else
return simde_mm512_maskz_mov_epi8(k, simde_mm512_popcnt_epi8(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_popcnt_epi8
#define _mm512_maskz_popcnt_epi8(k, a) simde_mm512_maskz_popcnt_epi8(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_popcnt_epi16 (simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_popcnt_epi16(a);
#else
simde__m512i_private
r_,
a_ = simde__m512i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]);
}
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_mm256_popcnt_epi16(a_.m256i[i]);
}
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
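/* 16-bit SWAR popcount on 512-bit vectors; the 16-bit subtract, shift,
 * add and multiply all require AVX-512BW. */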
r_.n =
_mm512_sub_epi16(
a_.n,
_mm512_and_si512(
_mm512_srli_epi16(a_.n, 1),
_mm512_set1_epi16(0x5555)
)
);
r_.n =
_mm512_add_epi16(
_mm512_and_si512(
r_.n,
_mm512_set1_epi16(0x3333)
),
_mm512_and_si512(
_mm512_srli_epi16(r_.n, 2),
_mm512_set1_epi16(0x3333)
)
);
r_.n =
_mm512_and_si512(
_mm512_add_epi16(
r_.n,
_mm512_srli_epi16(r_.n, 4)
),
_mm512_set1_epi16(0x0f0f)
);
r_.n =
_mm512_srli_epi16(
_mm512_mullo_epi16(
r_.n,
_mm512_set1_epi16(0x0101)
),
(sizeof(uint16_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
v -= ((v >> 1) & UINT16_C(0x5555));
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_popcnt_epi16
#define _mm512_popcnt_epi16(a) simde_mm512_popcnt_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_popcnt_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_mask_popcnt_epi16(src, k, a);
#else
return simde_mm512_mask_mov_epi16(src, k, simde_mm512_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_popcnt_epi16
#define _mm512_mask_popcnt_epi16(src, k, a) simde_mm512_mask_popcnt_epi16(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_popcnt_epi16 (simde__mmask32 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
return _mm512_maskz_popcnt_epi16(k, a);
#else
return simde_mm512_maskz_mov_epi16(k, simde_mm512_popcnt_epi16(a));
#endif
}
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_popcnt_epi16
#define _mm512_maskz_popcnt_epi16(k, a) simde_mm512_maskz_popcnt_epi16(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_popcnt_epi32 (simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_popcnt_epi32(a);
#else
simde__m512i_private
r_,
a_ = simde__m512i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]);
}
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_mm256_popcnt_epi32(a_.m256i[i]);
}
#elif defined(SIMDE_X86_AVX512F_NATIVE)
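/* 32-bit SWAR popcount; every operation used here is available in base
 * AVX-512F. */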
r_.n =
_mm512_sub_epi32(
a_.n,
_mm512_and_si512(
_mm512_srli_epi32(a_.n, 1),
_mm512_set1_epi32(0x55555555)
)
);
r_.n =
_mm512_add_epi32(
_mm512_and_si512(
r_.n,
_mm512_set1_epi32(0x33333333)
),
_mm512_and_si512(
_mm512_srli_epi32(r_.n, 2),
_mm512_set1_epi32(0x33333333)
)
);
r_.n =
_mm512_and_si512(
_mm512_add_epi32(
r_.n,
_mm512_srli_epi32(r_.n, 4)
),
_mm512_set1_epi32(0x0f0f0f0f)
);
r_.n =
_mm512_srli_epi32(
_mm512_mullo_epi32(
r_.n,
_mm512_set1_epi32(0x01010101)
),
(sizeof(uint32_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
v -= ((v >> 1) & UINT32_C(0x55555555));
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_popcnt_epi32
#define _mm512_popcnt_epi32(a) simde_mm512_popcnt_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_popcnt_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_mask_popcnt_epi32(src, k, a);
#else
return simde_mm512_mask_mov_epi32(src, k, simde_mm512_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_popcnt_epi32
#define _mm512_mask_popcnt_epi32(src, k, a) simde_mm512_mask_popcnt_epi32(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_popcnt_epi32 (simde__mmask16 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_maskz_popcnt_epi32(k, a);
#else
return simde_mm512_maskz_mov_epi32(k, simde_mm512_popcnt_epi32(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_popcnt_epi32
#define _mm512_maskz_popcnt_epi32(k, a) simde_mm512_maskz_popcnt_epi32(k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_popcnt_epi64 (simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_popcnt_epi64(a);
#else
simde__m512i_private
r_,
a_ = simde__m512i_to_private(a);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]);
}
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < sizeof(r_.m256i) / sizeof(r_.m256i[0]) ; i++) {
r_.m256i[i] = simde_mm256_popcnt_epi64(a_.m256i[i]);
}
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
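/* Nibble-LUT byte counts accumulated per 64-bit lane with
 * _mm512_sad_epu8 (AVX-512BW) against zero. */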
const __m512i low_nibble_set = _mm512_set1_epi8(0x0f);
const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n);
const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n);
const __m512i lut =
simde_mm512_set_epi8(
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
);
r_.n =
_mm512_sad_epu8(
_mm512_add_epi8(
_mm512_shuffle_epi8(
lut,
low_nibble_of_input
),
_mm512_shuffle_epi8(
lut,
_mm512_srli_epi16(high_nibble_of_input, 4)
)
),
_mm512_setzero_si512()
);
#elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE)
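/* 64-bit SWAR popcount; _mm512_mullo_epi64 requires AVX-512DQ, hence the
 * extra guard on this branch. */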
r_.n =
_mm512_sub_epi64(
a_.n,
_mm512_and_si512(
_mm512_srli_epi64(a_.n, 1),
_mm512_set1_epi64(0x5555555555555555)
)
);
r_.n =
_mm512_add_epi64(
_mm512_and_si512(
r_.n,
_mm512_set1_epi64(0x3333333333333333)
),
_mm512_and_si512(
_mm512_srli_epi64(r_.n, 2),
_mm512_set1_epi64(0x3333333333333333)
)
);
r_.n =
_mm512_and_si512(
_mm512_add_epi64(
r_.n,
_mm512_srli_epi64(r_.n, 4)
),
_mm512_set1_epi64(0x0f0f0f0f0f0f0f0f)
);
r_.n =
_mm512_srli_epi64(
_mm512_mullo_epi64(
r_.n,
_mm512_set1_epi64(0x0101010101010101)
),
(sizeof(uint64_t) - 1) * CHAR_BIT
);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
}
#endif
return simde__m512i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_popcnt_epi64
#define _mm512_popcnt_epi64(a) simde_mm512_popcnt_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_popcnt_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_mask_popcnt_epi64(src, k, a);
#else
return simde_mm512_mask_mov_epi64(src, k, simde_mm512_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_popcnt_epi64
#define _mm512_mask_popcnt_epi64(src, k, a) simde_mm512_mask_popcnt_epi64(src, k, a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_popcnt_epi64 (simde__mmask8 k, simde__m512i a) {
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
return _mm512_maskz_popcnt_epi64(k, a);
#else
return simde_mm512_maskz_mov_epi64(k, simde_mm512_popcnt_epi64(a));
#endif
}
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_popcnt_epi64
#define _mm512_maskz_popcnt_epi64(k, a) simde_mm512_maskz_popcnt_epi64(k, a)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX512_POPCNT_H) */