1347 lines
45 KiB
C
1347 lines
45 KiB
C
#if !defined(SIMDE_X86_AVX512_POPCNT_H)
|
|
#define SIMDE_X86_AVX512_POPCNT_H
|
|
|
|
#include "types.h"
|
|
#include "mov.h"
|
|
|
|
HEDLEY_DIAGNOSTIC_PUSH
|
|
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
|
|
SIMDE_BEGIN_DECLS_
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_popcnt_epi8 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_popcnt_epi8(a);
|
|
#else
|
|
simde__m128i_private
|
|
r_,
|
|
a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i8 = vcntq_s8(a_.neon_i8);
|
|
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
|
|
r_.wasm_v128 = wasm_i8x16_popcnt(a_.wasm_v128);
|
|
#elif defined(SIMDE_X86_SSSE3_NATIVE)
|
|
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
|
|
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
|
|
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
|
|
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
|
|
|
|
r_.n =
|
|
_mm_add_epi8(
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
_mm_srli_epi16(
|
|
high_nibble_of_input,
|
|
4
|
|
)
|
|
)
|
|
);
|
|
#elif defined(SIMDE_X86_SSE2_NATIVE)
|
|
/* v -= ((v >> 1) & UINT8_C(0x55)); */
|
|
r_.n =
|
|
_mm_sub_epi8(
|
|
a_.n,
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(a_.n, 1),
|
|
_mm_set1_epi8(0x55)
|
|
)
|
|
);
|
|
|
|
/* v = (v & 0x33) + ((v >> 2) & 0x33); */
|
|
r_.n =
|
|
_mm_add_epi8(
|
|
_mm_and_si128(
|
|
r_.n,
|
|
_mm_set1_epi8(0x33)
|
|
),
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(r_.n, 2),
|
|
_mm_set1_epi8(0x33)
|
|
)
|
|
);
|
|
|
|
/* v = (v + (v >> 4)) & 0xf */
|
|
r_.n =
|
|
_mm_and_si128(
|
|
_mm_add_epi8(
|
|
r_.n,
|
|
_mm_srli_epi16(r_.n, 4)
|
|
),
|
|
_mm_set1_epi8(0x0f)
|
|
);
|
|
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
|
|
r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), a_.altivec_i8)));
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u8 -= ((a_.u8 >> 1) & 0x55);
|
|
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
|
|
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
|
|
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
|
|
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
|
|
v -= ((v >> 1) & 0x55);
|
|
v = (v & 0x33) + ((v >> 2) & 0x33);
|
|
v = (v + (v >> 4)) & 0xf;
|
|
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_popcnt_epi8
|
|
#define _mm_popcnt_epi8(a) simde_mm_popcnt_epi8(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_popcnt_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_mask_popcnt_epi8(src, k, a);
|
|
#else
|
|
return simde_mm_mask_mov_epi8(src, k, simde_mm_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_popcnt_epi8
|
|
#define _mm_mask_popcnt_epi8(src, k, a) simde_mm_mask_popcnt_epi8(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskz_popcnt_epi8 (simde__mmask16 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_maskz_popcnt_epi8(k, a);
|
|
#else
|
|
return simde_mm_maskz_mov_epi8(k, simde_mm_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskz_popcnt_epi8
|
|
#define _mm_maskz_popcnt_epi8(k, a) simde_mm_maskz_popcnt_epi8(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_popcnt_epi16 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_popcnt_epi16(a);
|
|
#else
|
|
simde__m128i_private
|
|
r_,
|
|
a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i16 = vpaddlq_s8(vcntq_s8(a_.neon_i8));
|
|
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
|
|
r_.wasm_v128 = wasm_i16x8_extadd_pairwise_i8x16(wasm_i8x16_popcnt(a_.wasm_v128));
|
|
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
|
|
r_.altivec_u16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), a_.altivec_u16)));
|
|
#elif defined(SIMDE_X86_XOP_NATIVE)
|
|
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
|
|
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
|
|
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
|
|
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
|
|
|
|
r_.n =
|
|
_mm_haddw_epi8(
|
|
_mm_add_epi8(
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
_mm_srli_epi16(high_nibble_of_input, 4)
|
|
)
|
|
)
|
|
);
|
|
#elif defined(SIMDE_X86_SSE2_NATIVE)
|
|
r_.n =
|
|
_mm_sub_epi16(
|
|
a_.n,
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(a_.n, 1),
|
|
_mm_set1_epi16(0x5555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_add_epi16(
|
|
_mm_and_si128(
|
|
r_.n,
|
|
_mm_set1_epi16(0x3333)
|
|
),
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(r_.n, 2),
|
|
_mm_set1_epi16(0x3333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_and_si128(
|
|
_mm_add_epi16(
|
|
r_.n,
|
|
_mm_srli_epi16(r_.n, 4)
|
|
),
|
|
_mm_set1_epi16(0x0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_srli_epi16(
|
|
_mm_mullo_epi16(
|
|
r_.n,
|
|
_mm_set1_epi16(0x0101)
|
|
),
|
|
(sizeof(uint16_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
|
|
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
|
|
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
|
|
v -= ((v >> 1) & UINT16_C(0x5555));
|
|
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
|
|
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_popcnt_epi16
|
|
#define _mm_popcnt_epi16(a) simde_mm_popcnt_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_popcnt_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_mask_popcnt_epi16(src, k, a);
|
|
#else
|
|
return simde_mm_mask_mov_epi16(src, k, simde_mm_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_popcnt_epi16
|
|
#define _mm_mask_popcnt_epi16(src, k, a) simde_mm_mask_popcnt_epi16(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskz_popcnt_epi16 (simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_maskz_popcnt_epi16(k, a);
|
|
#else
|
|
return simde_mm_maskz_mov_epi16(k, simde_mm_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskz_popcnt_epi16
|
|
#define _mm_maskz_popcnt_epi16(k, a) simde_mm_maskz_popcnt_epi16(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_popcnt_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_popcnt_epi32(a);
|
|
#else
|
|
simde__m128i_private
|
|
r_,
|
|
a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i32 = vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8)));
|
|
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
|
|
r_.altivec_u32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), a_.altivec_u32)));
|
|
#elif defined(SIMDE_X86_XOP_NATIVE)
|
|
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
|
|
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
|
|
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
|
|
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
|
|
|
|
r_.n =
|
|
_mm_haddd_epi8(
|
|
_mm_add_epi8(
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
_mm_srli_epi16(high_nibble_of_input, 4)
|
|
)
|
|
)
|
|
);
|
|
#elif defined(SIMDE_X86_SSE4_1_NATIVE)
|
|
r_.n =
|
|
_mm_sub_epi32(
|
|
a_.n,
|
|
_mm_and_si128(
|
|
_mm_srli_epi32(a_.n, 1),
|
|
_mm_set1_epi32(0x55555555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_add_epi32(
|
|
_mm_and_si128(
|
|
r_.n,
|
|
_mm_set1_epi32(0x33333333)
|
|
),
|
|
_mm_and_si128(
|
|
_mm_srli_epi32(r_.n, 2),
|
|
_mm_set1_epi32(0x33333333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_and_si128(
|
|
_mm_add_epi32(
|
|
r_.n,
|
|
_mm_srli_epi32(r_.n, 4)
|
|
),
|
|
_mm_set1_epi32(0x0f0f0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_srli_epi32(
|
|
_mm_mullo_epi32(
|
|
r_.n,
|
|
_mm_set1_epi32(0x01010101)
|
|
),
|
|
(sizeof(uint32_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
|
|
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
|
|
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
|
|
v -= ((v >> 1) & UINT32_C(0x55555555));
|
|
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
|
|
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_popcnt_epi32
|
|
#define _mm_popcnt_epi32(a) simde_mm_popcnt_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_popcnt_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_mask_popcnt_epi32(src, k, a);
|
|
#else
|
|
return simde_mm_mask_mov_epi32(src, k, simde_mm_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_popcnt_epi32
|
|
#define _mm_mask_popcnt_epi32(src, k, a) simde_mm_mask_popcnt_epi32(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskz_popcnt_epi32 (simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_maskz_popcnt_epi32(k, a);
|
|
#else
|
|
return simde_mm_maskz_mov_epi32(k, simde_mm_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskz_popcnt_epi32
|
|
#define _mm_maskz_popcnt_epi32(k, a) simde_mm_maskz_popcnt_epi32(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_popcnt_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_popcnt_epi64(a);
|
|
#else
|
|
simde__m128i_private
|
|
r_,
|
|
a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i64 = vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8))));
|
|
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
|
|
r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), a_.altivec_u64)));
|
|
#elif defined(SIMDE_X86_SSSE3_NATIVE)
|
|
const __m128i low_nibble_set = _mm_set1_epi8(0x0f);
|
|
const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n);
|
|
const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n);
|
|
const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
|
|
|
|
r_.n =
|
|
_mm_sad_epu8(
|
|
_mm_add_epi8(
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm_shuffle_epi8(
|
|
lut,
|
|
_mm_srli_epi16(high_nibble_of_input, 4)
|
|
)
|
|
),
|
|
_mm_setzero_si128()
|
|
);
|
|
#elif defined(SIMDE_X86_SSE2_NATIVE)
|
|
r_.n =
|
|
_mm_sub_epi8(
|
|
a_.n,
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(a_.n, 1),
|
|
_mm_set1_epi8(0x55)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_add_epi8(
|
|
_mm_and_si128(
|
|
r_.n,
|
|
_mm_set1_epi8(0x33)
|
|
),
|
|
_mm_and_si128(
|
|
_mm_srli_epi16(r_.n, 2),
|
|
_mm_set1_epi8(0x33)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_and_si128(
|
|
_mm_add_epi8(
|
|
r_.n,
|
|
_mm_srli_epi16(r_.n, 4)
|
|
),
|
|
_mm_set1_epi8(0x0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm_sad_epu8(
|
|
r_.n,
|
|
_mm_setzero_si128()
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
|
|
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
|
|
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
|
|
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
|
|
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
|
|
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_popcnt_epi64
|
|
#define _mm_popcnt_epi64(a) simde_mm_popcnt_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_popcnt_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_mask_popcnt_epi64(src, k, a);
|
|
#else
|
|
return simde_mm_mask_mov_epi64(src, k, simde_mm_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_popcnt_epi64
|
|
#define _mm_mask_popcnt_epi64(src, k, a) simde_mm_mask_popcnt_epi64(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskz_popcnt_epi64 (simde__mmask8 k, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm_maskz_popcnt_epi64(k, a);
|
|
#else
|
|
return simde_mm_maskz_mov_epi64(k, simde_mm_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskz_popcnt_epi64
|
|
#define _mm_maskz_popcnt_epi64(k, a) simde_mm_maskz_popcnt_epi64(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_popcnt_epi8 (simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_popcnt_epi8(a);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX2_NATIVE)
|
|
const __m256i low_nibble_set = _mm256_set1_epi8(0x0f);
|
|
const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n);
|
|
const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n);
|
|
const __m256i lut =
|
|
_mm256_set_epi8(
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_add_epi8(
|
|
_mm256_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm256_shuffle_epi8(
|
|
lut,
|
|
_mm256_srli_epi16(
|
|
high_nibble_of_input,
|
|
4
|
|
)
|
|
)
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u8 -= ((a_.u8 >> 1) & 0x55);
|
|
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
|
|
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
|
|
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
|
|
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
|
|
v -= ((v >> 1) & 0x55);
|
|
v = (v & 0x33) + ((v >> 2) & 0x33);
|
|
v = (v + (v >> 4)) & 0xf;
|
|
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_popcnt_epi8
|
|
#define _mm256_popcnt_epi8(a) simde_mm256_popcnt_epi8(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_popcnt_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_mask_popcnt_epi8(src, k, a);
|
|
#else
|
|
return simde_mm256_mask_mov_epi8(src, k, simde_mm256_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_popcnt_epi8
|
|
#define _mm256_mask_popcnt_epi8(src, k, a) simde_mm256_mask_popcnt_epi8(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskz_popcnt_epi8 (simde__mmask32 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_maskz_popcnt_epi8(k, a);
|
|
#else
|
|
return simde_mm256_maskz_mov_epi8(k, simde_mm256_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskz_popcnt_epi8
|
|
#define _mm256_maskz_popcnt_epi8(k, a) simde_mm256_maskz_popcnt_epi8(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_popcnt_epi16 (simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_popcnt_epi16(a);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX2_NATIVE)
|
|
r_.n =
|
|
_mm256_sub_epi16(
|
|
a_.n,
|
|
_mm256_and_si256(
|
|
_mm256_srli_epi16(a_.n, 1),
|
|
_mm256_set1_epi16(0x5555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_add_epi16(
|
|
_mm256_and_si256(
|
|
r_.n,
|
|
_mm256_set1_epi16(0x3333)
|
|
),
|
|
_mm256_and_si256(
|
|
_mm256_srli_epi16(r_.n, 2),
|
|
_mm256_set1_epi16(0x3333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_and_si256(
|
|
_mm256_add_epi16(
|
|
r_.n,
|
|
_mm256_srli_epi16(r_.n, 4)
|
|
),
|
|
_mm256_set1_epi16(0x0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_srli_epi16(
|
|
_mm256_mullo_epi16(
|
|
r_.n,
|
|
_mm256_set1_epi16(0x0101)
|
|
),
|
|
(sizeof(uint16_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
|
|
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
|
|
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
|
|
v -= ((v >> 1) & UINT16_C(0x5555));
|
|
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
|
|
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_popcnt_epi16
|
|
#define _mm256_popcnt_epi16(a) simde_mm256_popcnt_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_popcnt_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_mask_popcnt_epi16(src, k, a);
|
|
#else
|
|
return simde_mm256_mask_mov_epi16(src, k, simde_mm256_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_popcnt_epi16
|
|
#define _mm256_mask_popcnt_epi16(src, k, a) simde_mm256_mask_popcnt_epi16(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskz_popcnt_epi16 (simde__mmask16 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_maskz_popcnt_epi16(k, a);
|
|
#else
|
|
return simde_mm256_maskz_mov_epi16(k, simde_mm256_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskz_popcnt_epi16
|
|
#define _mm256_maskz_popcnt_epi16(k, a) simde_mm256_maskz_popcnt_epi16(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_popcnt_epi32 (simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_popcnt_epi32(a);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX2_NATIVE)
|
|
r_.n =
|
|
_mm256_sub_epi32(
|
|
a_.n,
|
|
_mm256_and_si256(
|
|
_mm256_srli_epi32(a_.n, 1),
|
|
_mm256_set1_epi32(0x55555555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_add_epi32(
|
|
_mm256_and_si256(
|
|
r_.n,
|
|
_mm256_set1_epi32(0x33333333)
|
|
),
|
|
_mm256_and_si256(
|
|
_mm256_srli_epi32(r_.n, 2),
|
|
_mm256_set1_epi32(0x33333333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_and_si256(
|
|
_mm256_add_epi32(
|
|
r_.n,
|
|
_mm256_srli_epi32(r_.n, 4)
|
|
),
|
|
_mm256_set1_epi32(0x0f0f0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_srli_epi32(
|
|
_mm256_mullo_epi32(
|
|
r_.n,
|
|
_mm256_set1_epi32(0x01010101)
|
|
),
|
|
(sizeof(uint32_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
|
|
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
|
|
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
|
|
v -= ((v >> 1) & UINT32_C(0x55555555));
|
|
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
|
|
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_popcnt_epi32
|
|
#define _mm256_popcnt_epi32(a) simde_mm256_popcnt_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_popcnt_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_mask_popcnt_epi32(src, k, a);
|
|
#else
|
|
return simde_mm256_mask_mov_epi32(src, k, simde_mm256_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_popcnt_epi32
|
|
#define _mm256_mask_popcnt_epi32(src, k, a) simde_mm256_mask_popcnt_epi32(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskz_popcnt_epi32 (simde__mmask8 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_maskz_popcnt_epi32(k, a);
|
|
#else
|
|
return simde_mm256_maskz_mov_epi32(k, simde_mm256_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskz_popcnt_epi32
|
|
#define _mm256_maskz_popcnt_epi32(k, a) simde_mm256_maskz_popcnt_epi32(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_popcnt_epi64 (simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_popcnt_epi64(a);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < sizeof(r_.m128i) / sizeof(r_.m128i[0]) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX2_NATIVE)
|
|
const __m256i low_nibble_set = _mm256_set1_epi8(0x0f);
|
|
const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n);
|
|
const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n);
|
|
const __m256i lut =
|
|
_mm256_set_epi8(
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
|
|
);
|
|
|
|
r_.n =
|
|
_mm256_sad_epu8(
|
|
_mm256_add_epi8(
|
|
_mm256_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm256_shuffle_epi8(
|
|
lut,
|
|
_mm256_srli_epi16(high_nibble_of_input, 4)
|
|
)
|
|
),
|
|
_mm256_setzero_si256()
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
|
|
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
|
|
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
|
|
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
|
|
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
|
|
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_popcnt_epi64
|
|
#define _mm256_popcnt_epi64(a) simde_mm256_popcnt_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_popcnt_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_mask_popcnt_epi64(src, k, a);
|
|
#else
|
|
return simde_mm256_mask_mov_epi64(src, k, simde_mm256_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_popcnt_epi64
|
|
#define _mm256_mask_popcnt_epi64(src, k, a) simde_mm256_mask_popcnt_epi64(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskz_popcnt_epi64 (simde__mmask8 k, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
|
|
return _mm256_maskz_popcnt_epi64(k, a);
|
|
#else
|
|
return simde_mm256_maskz_mov_epi64(k, simde_mm256_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskz_popcnt_epi64
|
|
#define _mm256_maskz_popcnt_epi64(k, a) simde_mm256_maskz_popcnt_epi64(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_popcnt_epi8 (simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_popcnt_epi8(a);
|
|
#else
|
|
simde__m512i_private
|
|
r_,
|
|
a_ = simde__m512i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]);
|
|
}
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
|
|
r_.m256i[i] = simde_mm256_popcnt_epi8(a_.m256i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
|
|
const __m512i low_nibble_set = _mm512_set1_epi8(0x0f);
|
|
const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n);
|
|
const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n);
|
|
const __m512i lut =
|
|
simde_mm512_set_epi8(
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_add_epi8(
|
|
_mm512_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm512_shuffle_epi8(
|
|
lut,
|
|
_mm512_srli_epi16(
|
|
high_nibble_of_input,
|
|
4
|
|
)
|
|
)
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u8 -= ((a_.u8 >> 1) & 0x55);
|
|
a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33));
|
|
a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15;
|
|
r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
|
|
uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]);
|
|
v -= ((v >> 1) & 0x55);
|
|
v = (v & 0x33) + ((v >> 2) & 0x33);
|
|
v = (v + (v >> 4)) & 0xf;
|
|
r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT;
|
|
}
|
|
#endif
|
|
|
|
return simde__m512i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_popcnt_epi8
|
|
#define _mm512_popcnt_epi8(a) simde_mm512_popcnt_epi8(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_mask_popcnt_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_mask_popcnt_epi8(src, k, a);
|
|
#else
|
|
return simde_mm512_mask_mov_epi8(src, k, simde_mm512_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_mask_popcnt_epi8
|
|
#define _mm512_mask_popcnt_epi8(src, k, a) simde_mm512_mask_popcnt_epi8(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_maskz_popcnt_epi8 (simde__mmask64 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_maskz_popcnt_epi8(k, a);
|
|
#else
|
|
return simde_mm512_maskz_mov_epi8(k, simde_mm512_popcnt_epi8(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_maskz_popcnt_epi8
|
|
#define _mm512_maskz_popcnt_epi8(k, a) simde_mm512_maskz_popcnt_epi8(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_popcnt_epi16 (simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_popcnt_epi16(a);
|
|
#else
|
|
simde__m512i_private
|
|
r_,
|
|
a_ = simde__m512i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]);
|
|
}
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
|
|
r_.m256i[i] = simde_mm256_popcnt_epi16(a_.m256i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
|
|
r_.n =
|
|
_mm512_sub_epi16(
|
|
a_.n,
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi16(a_.n, 1),
|
|
_mm512_set1_epi16(0x5555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_add_epi16(
|
|
_mm512_and_si512(
|
|
r_.n,
|
|
_mm512_set1_epi16(0x3333)
|
|
),
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi16(r_.n, 2),
|
|
_mm512_set1_epi16(0x3333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_and_si512(
|
|
_mm512_add_epi16(
|
|
r_.n,
|
|
_mm512_srli_epi16(r_.n, 4)
|
|
),
|
|
_mm512_set1_epi16(0x0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_srli_epi16(
|
|
_mm512_mullo_epi16(
|
|
r_.n,
|
|
_mm512_set1_epi16(0x0101)
|
|
),
|
|
(sizeof(uint16_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555));
|
|
a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333)));
|
|
a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]);
|
|
v -= ((v >> 1) & UINT16_C(0x5555));
|
|
v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333)));
|
|
v = (v + (v >> 4)) & UINT16_C(0x0f0f);
|
|
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m512i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_popcnt_epi16
|
|
#define _mm512_popcnt_epi16(a) simde_mm512_popcnt_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_mask_popcnt_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_mask_popcnt_epi16(src, k, a);
|
|
#else
|
|
return simde_mm512_mask_mov_epi16(src, k, simde_mm512_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_mask_popcnt_epi16
|
|
#define _mm512_mask_popcnt_epi16(src, k, a) simde_mm512_mask_popcnt_epi16(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_maskz_popcnt_epi16 (simde__mmask32 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512BITALG_NATIVE)
|
|
return _mm512_maskz_popcnt_epi16(k, a);
|
|
#else
|
|
return simde_mm512_maskz_mov_epi16(k, simde_mm512_popcnt_epi16(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_maskz_popcnt_epi16
|
|
#define _mm512_maskz_popcnt_epi16(k, a) simde_mm512_maskz_popcnt_epi16(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_popcnt_epi32 (simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_popcnt_epi32(a);
|
|
#else
|
|
simde__m512i_private
|
|
r_,
|
|
a_ = simde__m512i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]);
|
|
}
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
|
|
r_.m256i[i] = simde_mm256_popcnt_epi32(a_.m256i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX512F_NATIVE)
|
|
r_.n =
|
|
_mm512_sub_epi32(
|
|
a_.n,
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi32(a_.n, 1),
|
|
_mm512_set1_epi32(0x55555555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_add_epi32(
|
|
_mm512_and_si512(
|
|
r_.n,
|
|
_mm512_set1_epi32(0x33333333)
|
|
),
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi32(r_.n, 2),
|
|
_mm512_set1_epi32(0x33333333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_and_si512(
|
|
_mm512_add_epi32(
|
|
r_.n,
|
|
_mm512_srli_epi32(r_.n, 4)
|
|
),
|
|
_mm512_set1_epi32(0x0f0f0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_srli_epi32(
|
|
_mm512_mullo_epi32(
|
|
r_.n,
|
|
_mm512_set1_epi32(0x01010101)
|
|
),
|
|
(sizeof(uint32_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555));
|
|
a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333)));
|
|
a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]);
|
|
v -= ((v >> 1) & UINT32_C(0x55555555));
|
|
v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333)));
|
|
v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f);
|
|
r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m512i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_popcnt_epi32
|
|
#define _mm512_popcnt_epi32(a) simde_mm512_popcnt_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_mask_popcnt_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_mask_popcnt_epi32(src, k, a);
|
|
#else
|
|
return simde_mm512_mask_mov_epi32(src, k, simde_mm512_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_mask_popcnt_epi32
|
|
#define _mm512_mask_popcnt_epi32(src, k, a) simde_mm512_mask_popcnt_epi32(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_maskz_popcnt_epi32 (simde__mmask16 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_maskz_popcnt_epi32(k, a);
|
|
#else
|
|
return simde_mm512_maskz_mov_epi32(k, simde_mm512_popcnt_epi32(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_maskz_popcnt_epi32
|
|
#define _mm512_maskz_popcnt_epi32(k, a) simde_mm512_maskz_popcnt_epi32(k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_popcnt_epi64 (simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_popcnt_epi64(a);
|
|
#else
|
|
simde__m512i_private
|
|
r_,
|
|
a_ = simde__m512i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
|
|
r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]);
|
|
}
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
|
|
for (size_t i = 0 ; i < sizeof(r_.m256i) / sizeof(r_.m256i[0]) ; i++) {
|
|
r_.m256i[i] = simde_mm256_popcnt_epi64(a_.m256i[i]);
|
|
}
|
|
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
|
|
const __m512i low_nibble_set = _mm512_set1_epi8(0x0f);
|
|
const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n);
|
|
const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n);
|
|
const __m512i lut =
|
|
simde_mm512_set_epi8(
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0,
|
|
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_sad_epu8(
|
|
_mm512_add_epi8(
|
|
_mm512_shuffle_epi8(
|
|
lut,
|
|
low_nibble_of_input
|
|
),
|
|
_mm512_shuffle_epi8(
|
|
lut,
|
|
_mm512_srli_epi16(high_nibble_of_input, 4)
|
|
)
|
|
),
|
|
_mm512_setzero_si512()
|
|
);
|
|
#elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE)
|
|
r_.n =
|
|
_mm512_sub_epi64(
|
|
a_.n,
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi64(a_.n, 1),
|
|
_mm512_set1_epi64(0x5555555555555555)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_add_epi64(
|
|
_mm512_and_si512(
|
|
r_.n,
|
|
_mm512_set1_epi64(0x3333333333333333)
|
|
),
|
|
_mm512_and_si512(
|
|
_mm512_srli_epi64(r_.n, 2),
|
|
_mm512_set1_epi64(0x3333333333333333)
|
|
)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_and_si512(
|
|
_mm512_add_epi64(
|
|
r_.n,
|
|
_mm512_srli_epi64(r_.n, 4)
|
|
),
|
|
_mm512_set1_epi64(0x0f0f0f0f0f0f0f0f)
|
|
);
|
|
|
|
r_.n =
|
|
_mm512_srli_epi64(
|
|
_mm512_mullo_epi64(
|
|
r_.n,
|
|
_mm512_set1_epi64(0x0101010101010101)
|
|
),
|
|
(sizeof(uint64_t) - 1) * CHAR_BIT
|
|
);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555));
|
|
a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333)));
|
|
a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]);
|
|
v -= ((v >> 1) & UINT64_C(0x5555555555555555));
|
|
v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)));
|
|
v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
|
|
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT);
|
|
}
|
|
#endif
|
|
|
|
return simde__m512i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_popcnt_epi64
|
|
#define _mm512_popcnt_epi64(a) simde_mm512_popcnt_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_mask_popcnt_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_mask_popcnt_epi64(src, k, a);
|
|
#else
|
|
return simde_mm512_mask_mov_epi64(src, k, simde_mm512_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_mask_popcnt_epi64
|
|
#define _mm512_mask_popcnt_epi64(src, k, a) simde_mm512_mask_popcnt_epi64(src, k, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m512i
|
|
simde_mm512_maskz_popcnt_epi64 (simde__mmask8 k, simde__m512i a) {
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE)
|
|
return _mm512_maskz_popcnt_epi64(k, a);
|
|
#else
|
|
return simde_mm512_maskz_mov_epi64(k, simde_mm512_popcnt_epi64(a));
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm512_maskz_popcnt_epi64
|
|
#define _mm512_maskz_popcnt_epi64(k, a) simde_mm512_maskz_popcnt_epi64(k, a)
|
|
#endif
|
|
|
|
SIMDE_END_DECLS_
|
|
HEDLEY_DIAGNOSTIC_POP
|
|
|
|
#endif /* !defined(SIMDE_X86_AVX512_POPCNT_H) */
|