/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2018-2020 Evan Nemerson <evan@nemerson.com>
 *   2019-2020 Michael R. Crusoe <crusoe@debian.org>
 *   2020      Himanshi Mathur <himanshi18037@iiitd.ac.in>
 *   2020      Hidayat Khan <huk2209@gmail.com>
 */

#if !defined(SIMDE_X86_AVX2_H)
#define SIMDE_X86_AVX2_H

#include "avx.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

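/* Each simde_mm256_* wrapper below uses the native AVX2 intrinsic when
 * SIMDE_X86_AVX2_NATIVE is defined; otherwise it falls back to a pair of
 * 128-bit SIMDe operations, compiler vector extensions, or a scalar loop. */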
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi8 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi8(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi8
  #define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi16 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi16(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi16
  #define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi32(simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi32(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
        r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi32
  #define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 + b_.i8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] + b_.i8[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi8
  #define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 + b_.i16;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] + b_.i16[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi16
  #define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
#endif

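/* Horizontal add: when no native instruction is available, the pairwise sums
 * are formed by adding the even- and odd-indexed elements produced by the
 * simde_x_mm256_deinterleaveeven/deinterleaveodd helpers. */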
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadd_epi16(a, b);
  #else
    return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadd_epi16
  #define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 + b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] + b_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi32
  #define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadd_epi32(a, b);
  #else
    return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadd_epi32
  #define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS)
      r_.i64 = a_.i64 + b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] + b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi64
  #define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
#endif

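/* Note: _mm256_alignr_epi8 concatenates and shifts each 128-bit lane
 * independently, so the portable loop below works on m128i_private[h] one
 * lane at a time and the result is all zeros whenever count > 31. */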
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count)
    SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  if (HEDLEY_UNLIKELY(count > 31))
    return simde_mm256_setzero_si256();

  for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) {
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
      const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
      if (srcpos > 31) {
        r_.m128i_private[h].i8[i] = 0;
      } else if (srcpos > 15) {
        r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
      } else {
        r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
      }
    }
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_BUG_PGI_30106)
# define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_alignr_epi8(a, b, count) \
    simde_mm256_set_m128i( \
        simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
        simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_alignr_epi8
  #define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_and_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] & b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_and_si256
  #define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_andnot_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_andnot_si256
  #define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epi8
  #define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epi16
  #define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadds_epi16(a, b);
  #else
    return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadds_epi16
  #define _mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epu8
  #define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epu16
  #define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_avg_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_avg_epu8
  #define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_avg_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_avg_epu16
  #define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm_blend_epi32(a, b, imm8) \
    simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_blend_epi32
  #define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
#endif

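/* The 8-bit immediate only covers eight 16-bit elements, so it is reused for
 * the upper 128-bit lane; hence the (imm8 >> i%8) selection below. */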
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = ((imm8 >> i%8) & 1) ? b_.i16[i] : a_.i16[i];
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8))
#elif defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi16(a, b, imm8) \
    simde_mm256_set_m128i( \
        simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \
        simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blend_epi16
  #define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
#endif

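/* For the 32-bit blend all eight bits of imm8 map directly onto the eight
 * elements; the 128-bit fallback macro splits imm8 into its low and high
 * nibbles for the two halves. */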
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi32(a, b, imm8) \
    simde_mm256_set_m128i( \
        simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \
        simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blend_epi32
  #define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
#endif

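/* Variable blend: the most significant bit of each byte in `mask` selects the
 * corresponding byte from `b`; otherwise the byte from `a` is kept. */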
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_blendv_epi8(a, b, mask);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b),
      mask_ = simde__m256i_to_private(mask);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]);
      r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        if (mask_.u8[i] & 0x80) {
          r_.u8[i] = b_.u8[i];
        } else {
          r_.u8[i] = a_.u8[i];
        }
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blendv_epi8
  #define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
#endif

SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_broadcastb_epi8 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_broadcastb_epi8(a);
|
|
#else
|
|
simde__m128i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = a_.i8[0];
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastb_epi8
|
|
#define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_broadcastb_epi8 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastb_epi8(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = a_.i8[0];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastb_epi8
|
|
#define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_broadcastw_epi16 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_broadcastw_epi16(a);
|
|
#else
|
|
simde__m128i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.i16[0];
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastw_epi16
|
|
#define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_broadcastw_epi16 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastw_epi16(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.i16[0];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastw_epi16
|
|
#define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_broadcastd_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_broadcastd_epi32(a);
|
|
#else
|
|
simde__m128i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[0];
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastd_epi32
|
|
#define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_broadcastd_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastd_epi32(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[0];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastd_epi32
|
|
#define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_broadcastq_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_broadcastq_epi64(a);
|
|
#else
|
|
simde__m128i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i64[0];
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastq_epi64
|
|
#define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_broadcastq_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastq_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_= simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i64[0];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastq_epi64
|
|
#define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm_broadcastss_ps (simde__m128 a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_broadcastss_ps(a);
|
|
#else
|
|
simde__m128_private r_;
|
|
simde__m128_private a_= simde__m128_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
|
|
r_.f32[i] = a_.f32[0];
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastss_ps
|
|
#define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256
|
|
simde_mm256_broadcastss_ps (simde__m128 a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastss_ps(a);
|
|
#else
|
|
simde__m256_private r_;
|
|
simde__m128_private a_= simde__m128_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
|
|
r_.f32[i] = a_.f32[0];
|
|
}
|
|
|
|
return simde__m256_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastss_ps
|
|
#define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_broadcastsd_pd (simde__m128d a) {
|
|
return simde_mm_movedup_pd(a);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_broadcastsd_pd
|
|
#define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_broadcastsd_pd (simde__m128d a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_broadcastsd_pd(a);
|
|
#else
|
|
simde__m256d_private r_;
|
|
simde__m128d_private a_= simde__m128d_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
|
|
r_.f64[i] = a_.f64[0];
|
|
}
|
|
|
|
return simde__m256d_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastsd_pd
|
|
#define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_broadcastsi128_si256 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && \
|
|
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0))
|
|
return _mm256_broadcastsi128_si256(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i_private[0] = a_;
|
|
r_.m128i_private[1] = a_;
|
|
#else
|
|
r_.i64[0] = a_.i64[0];
|
|
r_.i64[1] = a_.i64[1];
|
|
r_.i64[2] = a_.i64[0];
|
|
r_.i64[3] = a_.i64[1];
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_broadcastsi128_si256
|
|
#define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
|
|
#undef _mm_broadcastsi128_si256
|
|
#define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_bslli_epi128 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
|
|
|
|
SIMDE_VECTORIZE
|
|
for (int i = 0 ; i < ssize ; i++) {
|
|
const int e = i - imm8;
|
|
if(i >= (ssize/2)) {
|
|
if(e >= (ssize/2) && e < ssize)
|
|
r_.i8[i] = a_.i8[e];
|
|
else
|
|
r_.i8[i] = 0;
|
|
}
|
|
else{
|
|
if(e >= 0 && e < (ssize/2))
|
|
r_.i8[i] = a_.i8[e];
|
|
else
|
|
r_.i8[i] = 0;
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && \
|
|
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
|
|
SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
|
|
#define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_bslli_epi128
|
|
#define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
|
|
|
|
SIMDE_VECTORIZE
|
|
for (int i = 0 ; i < ssize ; i++) {
|
|
const int e = i + imm8;
|
|
if(i < (ssize/2)) {
|
|
if(e >= 0 && e < (ssize/2))
|
|
r_.i8[i] = a_.i8[e];
|
|
else
|
|
r_.i8[i] = 0;
|
|
}
|
|
else{
|
|
if(e >= (ssize/2) && e < ssize)
|
|
r_.i8[i] = a_.i8[e];
|
|
else
|
|
r_.i8[i] = 0;
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && \
|
|
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
|
|
SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
|
|
#define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_bsrli_epi128
|
|
#define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpeq_epi8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpeq_epi8
|
|
#define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpeq_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpeq_epi16
|
|
#define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpeq_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpeq_epi32
|
|
#define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpeq_epi64(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpeq_epi64
|
|
#define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpgt_epi8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpgt_epi8
|
|
#define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpgt_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.i16 = a_.i16 > b_.i16;
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpgt_epi16
|
|
#define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpgt_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpgt_epi32
|
|
#define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cmpgt_epi64(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cmpgt_epi64
|
|
#define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi8_epi16 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi8_epi16(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.i8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi8_epi16
|
|
#define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi8_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi8_epi32(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi8_epi32
|
|
#define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi8_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi8_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i8[i];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi8_epi64
|
|
#define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi16_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi16_epi32(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi16_epi32
|
|
#define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi16_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi16_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi16_epi64
|
|
#define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepi32_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepi32_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepi32_epi64
|
|
#define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu8_epi16 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu8_epi16(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu8_epi16
|
|
#define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu8_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu8_epi32(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu8_epi32
|
|
#define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu8_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu8_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.u8[i];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu8_epi64
|
|
#define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu16_epi32 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu16_epi32(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.u16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu16_epi32
|
|
#define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu16_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu16_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.u16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu16_epi64
|
|
#define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_cvtepu32_epi64 (simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_cvtepu32_epi64(a);
|
|
#else
|
|
simde__m256i_private r_;
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
#if defined(SIMDE_CONVERT_VECTOR_)
|
|
SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.u32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_cvtepu32_epi64
|
|
#define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
int
|
|
simde_mm256_extract_epi8 (simde__m256i a, const int index)
|
|
SIMDE_REQUIRE_RANGE(index, 0, 31){
|
|
simde__m256i_private a_ = simde__m256i_to_private(a);
|
|
return a_.i8[index];
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_extract_epi8
|
|
#define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
int
|
|
simde_mm256_extract_epi16 (simde__m256i a, const int index)
|
|
SIMDE_REQUIRE_RANGE(index, 0, 15) {
|
|
simde__m256i_private a_ = simde__m256i_to_private(a);
|
|
return a_.i16[index];
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_extract_epi16
|
|
#define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
|
|
simde__m256i_private a_ = simde__m256i_to_private(a);
|
|
return a_.m128i[imm8];
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_extracti128_si256
|
|
#define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i32gather_epi32
|
|
#define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
src_ = simde__m128i_to_private(src),
|
|
mask_ = simde__m128i_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
else {
|
|
r_.i32[i] = src_.i32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i32gather_epi32
|
|
#define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i32gather_epi32
|
|
#define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex),
|
|
src_ = simde__m256i_to_private(src),
|
|
mask_ = simde__m256i_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
else {
|
|
r_.i32[i] = src_.i32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i32gather_epi32
|
|
#define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i64gather_epi32
|
|
#define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
src_ = simde__m128i_to_private(src),
|
|
mask_ = simde__m128i_to_private(mask),
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
else {
|
|
r_.i32[i] = src_.i32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i64gather_epi32
|
|
#define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m128i_private
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i64gather_epi32
|
|
#define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m128i_private
|
|
src_ = simde__m128i_to_private(src),
|
|
mask_ = simde__m128i_to_private(mask),
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int32_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i32[i] = dst;
|
|
}
|
|
else {
|
|
r_.i32[i] = src_.i32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i64gather_epi32
|
|
#define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
|
|
#endif
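
/* The *gather_epi64 wrappers below cast base_addr differently for the
 * native intrinsics: int64_t const* for clang >= 3.8, long long const*
 * otherwise; the split mirrors the pointer type the respective intrinsic
 * prototypes declare. */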
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#else
|
|
#define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i32gather_epi64
|
|
#define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
src_ = simde__m128i_to_private(src),
|
|
mask_ = simde__m128i_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
else {
|
|
r_.i64[i] = src_.i64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#else
|
|
#define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i32gather_epi64
|
|
#define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m256i_private
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#else
|
|
#define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i32gather_epi64
|
|
#define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_i32gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m128i vindex, simde__m256i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
src_ = simde__m256i_to_private(src),
|
|
mask_ = simde__m256i_to_private(mask),
|
|
r_;
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
else {
|
|
r_.i64[i] = src_.i64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#else
|
|
#define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i32gather_epi64
|
|
#define _mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_i64gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#else
|
|
#define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i64gather_epi64
|
|
#define _mm_i64gather_epi64(base_addr, vindex, scale) simde_mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_mask_i64gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex),
|
|
src_ = simde__m128i_to_private(src),
|
|
mask_ = simde__m128i_to_private(mask),
|
|
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
else {
|
|
r_.i64[i] = src_.i64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#else
|
|
#define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i64gather_epi64
|
|
#define _mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_i64gather_epi64(const int64_t* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex),
|
|
r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#else
|
|
#define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i64gather_epi64
|
|
#define _mm256_i64gather_epi64(base_addr, vindex, scale) simde_mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mask_i64gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex),
|
|
src_ = simde__m256i_to_private(src),
|
|
mask_ = simde__m256i_to_private(mask),
|
|
r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
int64_t dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.i64[i] = dst;
|
|
}
|
|
else {
|
|
r_.i64[i] = src_.i64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
#define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#else
|
|
#define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i64gather_epi64
|
|
#define _mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
|
|
#endif
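
/* The *gather_ps variants follow the same per-element pattern with
 * simde_float32 data; their masked forms test bit 31 of each mask lane
 * reinterpreted as a 32-bit integer. */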
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm_i32gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128_private
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i32gather_ps(base_addr, vindex, scale) _mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i32gather_ps
|
|
#define _mm_i32gather_ps(base_addr, vindex, scale) simde_mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm_mask_i32gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128_private
|
|
src_ = simde__m128_to_private(src),
|
|
mask_ = simde__m128_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
else {
|
|
r_.f32[i] = src_.f32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i32gather_ps
|
|
#define _mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256
|
|
simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m256_private
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
|
|
return simde__m256_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i32gather_ps
|
|
#define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256
|
|
simde_mm256_mask_i32gather_ps(simde__m256 src, const simde_float32* base_addr, simde__m256i vindex, simde__m256 mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m256_private
|
|
src_ = simde__m256_to_private(src),
|
|
mask_ = simde__m256_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
else {
|
|
r_.f32[i] = src_.f32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i32gather_ps
|
|
#define _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm_i64gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128_private
|
|
r_ = simde__m128_to_private(simde_mm_setzero_ps());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i64gather_ps(base_addr, vindex, scale) _mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i64gather_ps
|
|
#define _mm_i64gather_ps(base_addr, vindex, scale) simde_mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128_private
|
|
src_ = simde__m128_to_private(src),
|
|
mask_ = simde__m128_to_private(mask),
|
|
r_ = simde__m128_to_private(simde_mm_setzero_ps());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
else {
|
|
r_.f32[i] = src_.f32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i64gather_ps
|
|
#define _mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm256_i64gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m128_private
|
|
r_ = simde__m128_to_private(simde_mm_setzero_ps());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i64gather_ps(base_addr, vindex, scale) _mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i64gather_ps
|
|
#define _mm256_i64gather_ps(base_addr, vindex, scale) simde_mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128
|
|
simde_mm256_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m256i vindex, simde__m128 mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m128_private
|
|
src_ = simde__m128_to_private(src),
|
|
mask_ = simde__m128_to_private(mask),
|
|
r_ = simde__m128_to_private(simde_mm_setzero_ps());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i32[i] >> 31) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float32 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f32[i] = dst;
|
|
}
|
|
else {
|
|
r_.f32[i] = src_.f32[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i64gather_ps
|
|
#define _mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
|
|
#endif
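
/* The *gather_pd variants gather simde_float64 elements; their masked
 * forms test bit 63 of each mask lane viewed as a 64-bit integer. */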
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128d_private
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
|
|
return simde__m128d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i32gather_pd(base_addr, vindex, scale) _mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i32gather_pd
|
|
#define _mm_i32gather_pd(base_addr, vindex, scale) simde_mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_mask_i32gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128d_private
|
|
src_ = simde__m128d_to_private(src),
|
|
mask_ = simde__m128d_to_private(mask),
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
else {
|
|
r_.f64[i] = src_.f64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i32gather_pd
|
|
#define _mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m256d_private
|
|
r_;
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
|
|
return simde__m256d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i32gather_pd(base_addr, vindex, scale) _mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i32gather_pd
|
|
#define _mm256_i32gather_pd(base_addr, vindex, scale) simde_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_mask_i32gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m128i vindex, simde__m256d mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256d_private
|
|
src_ = simde__m256d_to_private(src),
|
|
mask_ = simde__m256d_to_private(mask),
|
|
r_;
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
else {
|
|
r_.f64[i] = src_.f64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i32gather_pd
|
|
#define _mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_i64gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128d_private
|
|
r_ = simde__m128d_to_private(simde_mm_setzero_pd());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
|
|
return simde__m128d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_i64gather_pd(base_addr, vindex, scale) _mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_i64gather_pd
|
|
#define _mm_i64gather_pd(base_addr, vindex, scale) simde_mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_mask_i64gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m128i_private
|
|
vindex_ = simde__m128i_to_private(vindex);
|
|
simde__m128d_private
|
|
src_ = simde__m128d_to_private(src),
|
|
mask_ = simde__m128d_to_private(mask),
|
|
r_ = simde__m128d_to_private(simde_mm_setzero_pd());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
else {
|
|
r_.f64[i] = src_.f64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m128d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_mask_i64gather_pd
|
|
#define _mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_i64gather_pd(const simde_float64* base_addr, simde__m256i vindex, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m256d_private
|
|
r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
|
|
return simde__m256d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_i64gather_pd(base_addr, vindex, scale) _mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_i64gather_pd
|
|
#define _mm256_i64gather_pd(base_addr, vindex, scale) simde_mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_mask_i64gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m256i vindex, simde__m256d mask, const int32_t scale)
|
|
SIMDE_REQUIRE_CONSTANT(scale)
|
|
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
|
|
simde__m256i_private
|
|
vindex_ = simde__m256i_to_private(vindex);
|
|
simde__m256d_private
|
|
src_ = simde__m256d_to_private(src),
|
|
mask_ = simde__m256d_to_private(mask),
|
|
r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
|
|
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
|
|
if ((mask_.i64[i] >> 63) & 1) {
|
|
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
|
|
simde_float64 dst;
|
|
simde_memcpy(&dst, src1, sizeof(dst));
|
|
r_.f64[i] = dst;
|
|
}
|
|
else {
|
|
r_.f64[i] = src_.f64[i];
|
|
}
|
|
}
|
|
|
|
return simde__m256d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mask_i64gather_pd
|
|
#define _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
|
|
#endif
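
/* simde_mm256_inserti128_si256() overwrites one 128-bit half of `a' with
 * `b'; `imm8 & 1' selects the lower (0) or upper (1) half.  For example
 * (hypothetical), simde_mm256_inserti128_si256(a, b, 1) replaces the upper
 * half of `a'. */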
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_inserti128_si256(simde__m256i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
  simde__m256i_private a_ = simde__m256i_to_private(a);
  simde__m128i_private b_ = simde__m128i_to_private(b);

  a_.m128i_private[ imm8 & 1 ] = b_;

  return simde__m256i_from_private(a_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
  #define simde_mm256_inserti128_si256(a, b, imm8) _mm256_inserti128_si256(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_inserti128_si256
  #define _mm256_inserti128_si256(a, b, imm8) simde_mm256_inserti128_si256(a, b, imm8)
#endif
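
/* simde_mm256_madd_epi16() multiplies corresponding signed 16-bit elements
 * and horizontally adds each adjacent pair of 32-bit products.  The
 * __builtin_shufflevector path widens both operands to 32 bits, multiplies,
 * and then sums the even- and odd-indexed products. */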
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_madd_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
|
|
SIMDE_ALIGN_TO_32 int32_t product SIMDE_VECTOR(64);
|
|
SIMDE_ALIGN_TO_32 int32_t a32x16 SIMDE_VECTOR(64);
|
|
SIMDE_ALIGN_TO_32 int32_t b32x16 SIMDE_VECTOR(64);
|
|
SIMDE_ALIGN_TO_32 int32_t even SIMDE_VECTOR(32);
|
|
SIMDE_ALIGN_TO_32 int32_t odd SIMDE_VECTOR(32);
|
|
|
|
SIMDE_CONVERT_VECTOR_(a32x16, a_.i16);
|
|
SIMDE_CONVERT_VECTOR_(b32x16, b_.i16);
|
|
product = a32x16 * b32x16;
|
|
|
|
even = __builtin_shufflevector(product, product, 0, 2, 4, 6, 8, 10, 12, 14);
|
|
odd = __builtin_shufflevector(product, product, 1, 3, 5, 7, 9, 11, 13, 15);
|
|
|
|
r_.i32 = even + odd;
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
|
|
r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_madd_epi16
|
|
#define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b)
|
|
#endif
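
/* simde_mm256_maddubs_epi16() multiplies unsigned 8-bit elements of `a' by
 * the corresponding signed 8-bit elements of `b', adds adjacent pairs of
 * products, and saturates each sum to the signed 16-bit range. */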
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_maddubs_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
|
|
int32_t ts =
|
|
(HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) +
|
|
(HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
|
|
r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maddubs_epi16
|
|
#define _mm256_maddubs_epi16(a, b) simde_mm256_maddubs_epi16(a, b)
|
|
#endif
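
/* The maskload family returns only those elements whose mask lane has its
 * most-significant bit set; all other result lanes are zero.  Note that the
 * portable fallback loads the full vector first and then ANDs it with the
 * arithmetically right-shifted mask, so it may read memory that the native
 * instruction would leave untouched. */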
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_maskload_epi32(mem_addr, mask);
|
|
#else
|
|
simde__m128i_private
|
|
mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi32(mem_addr)),
|
|
r_,
|
|
mask_ = simde__m128i_to_private(mask);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31));
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31);
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskload_epi32
|
|
#define _mm_maskload_epi32(mem_addr, mask) simde_mm_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_maskload_epi32(mem_addr, mask);
|
|
#else
|
|
simde__m256i_private
|
|
mask_ = simde__m256i_to_private(mask),
|
|
r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi32(mem_addr));
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] &= mask_.i32[i] >> 31;
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskload_epi32
|
|
#define _mm256_maskload_epi32(mem_addr, mask) simde_mm256_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
|
|
#else
|
|
simde__m128i_private
|
|
mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi64((mem_addr))),
|
|
r_,
|
|
mask_ = simde__m128i_to_private(mask);
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63));
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63);
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskload_epi64
|
|
#define _mm_maskload_epi64(mem_addr, mask) simde_mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
|
|
#else
|
|
simde__m256i_private
|
|
mask_ = simde__m256i_to_private(mask),
|
|
r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi64((mem_addr)));
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] &= mask_.i64[i] >> 63;
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskload_epi64
|
|
#define _mm256_maskload_epi64(mem_addr, mask) simde_mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
|
|
#endif
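
/* The maskstore family writes only those elements of `a' whose mask lane
 * has its most-significant bit set; other memory locations are left
 * untouched.  Hypothetical example (`buf', `m' and `a' are illustrative):
 *
 *   simde__m128i m = simde_mm_set_epi32(0, -1, 0, -1);
 *   simde_mm_maskstore_epi32(buf, m, a);   // stores lanes 0 and 2 only
 */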
SIMDE_FUNCTION_ATTRIBUTES
|
|
void
|
|
simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
_mm_maskstore_epi32(mem_addr, mask, a);
|
|
#else
|
|
simde__m128i_private mask_ = simde__m128i_to_private(mask);
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
|
|
if (mask_.u32[i] & (UINT32_C(1) << 31))
|
|
mem_addr[i] = a_.i32[i];
|
|
}
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskstore_epi32
|
|
#define _mm_maskstore_epi32(mem_addr, mask, a) simde_mm_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
void
|
|
simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
_mm256_maskstore_epi32(mem_addr, mask, a);
|
|
#else
|
|
simde__m256i_private mask_ = simde__m256i_to_private(mask);
|
|
simde__m256i_private a_ = simde__m256i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
|
|
if (mask_.u32[i] & (UINT32_C(1) << 31))
|
|
mem_addr[i] = a_.i32[i];
|
|
}
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskstore_epi32
|
|
#define _mm256_maskstore_epi32(mem_addr, mask, a) simde_mm256_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
void
|
|
simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
|
|
#else
|
|
simde__m128i_private mask_ = simde__m128i_to_private(mask);
|
|
simde__m128i_private a_ = simde__m128i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
|
|
if (mask_.u64[i] & (UINT64_C(1) << 63))
|
|
mem_addr[i] = a_.i64[i];
|
|
}
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_maskstore_epi64
|
|
#define _mm_maskstore_epi64(mem_addr, mask, a) simde_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
void
|
|
simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256i a) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
|
|
#else
|
|
simde__m256i_private mask_ = simde__m256i_to_private(mask);
|
|
simde__m256i_private a_ = simde__m256i_to_private(a);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
|
|
if (mask_.u64[i] & (UINT64_C(1) << 63))
|
|
mem_addr[i] = a_.i64[i];
|
|
}
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_maskstore_epi64
|
|
#define _mm256_maskstore_epi64(mem_addr, mask, a) simde_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
|
|
#endif
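
/* The element-wise max/min functions below either defer to the two 128-bit
 * halves (when only SSE-sized vectors are natively available) or fall back
 * to a scalar per-element comparison loop. */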
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
|
|
return _mm256_max_epi8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epi8
|
|
#define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_max_epu8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
|
|
r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epu8
|
|
#define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_max_epu16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? a_.u16[i] : b_.u16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epu16
|
|
#define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_max_epu32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epu32
|
|
#define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_max_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epi16
|
|
#define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_max_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_max_epi32
|
|
#define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
|
|
return _mm256_min_epi8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epi8
|
|
#define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_min_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epi16
|
|
#define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_min_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epi32
|
|
#define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_min_epu8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
|
|
r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epu8
|
|
#define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_min_epu16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? a_.u16[i] : b_.u16[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epu16
|
|
#define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_min_epu32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_min_epu32
|
|
#define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b)
|
|
#endif
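
/* Illustrative sketch (not part of the API; uses the set1 helpers from
 * avx.h and arbitrary example data): the signed and unsigned variants
 * above interpret the same bit pattern differently, e.g. a byte of 0xFF
 * is -1 as epi8 but 255 as epu8.
 *
 *   simde__m256i a = simde_mm256_set1_epi8(-1);   // every byte is 0xFF
 *   simde__m256i b = simde_mm256_set1_epi8(1);
 *   simde__m256i s = simde_mm256_max_epi8(a, b);  // every byte is 1 (1 > -1)
 *   simde__m256i u = simde_mm256_max_epu8(a, b);  // every byte is 0xFF (255 > 1)
 */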

SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm256_movemask_epi8 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_movemask_epi8(a);
  #else
    simde__m256i_private a_ = simde__m256i_to_private(a);
    uint32_t r = 0;

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) {
        r |= HEDLEY_STATIC_CAST(uint32_t, simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i);
      }
    #else
      r = 0;
      SIMDE_VECTORIZE_REDUCTION(|:r)
      for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
        r |= HEDLEY_STATIC_CAST(uint32_t, (a_.u8[31 - i] >> 7)) << (31 - i);
      }
    #endif

    return HEDLEY_STATIC_CAST(int32_t, r);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_movemask_epi8
  #define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a)
#endif
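
/* Illustrative sketch (not part of the API; uses set1 and cmpeq helpers
 * declared elsewhere in SIMDe, with arbitrary example data): movemask_epi8
 * packs the top bit of each of the 32 bytes into bits 0..31 of the result,
 * so it is commonly paired with a byte-wise comparison.
 *
 *   simde__m256i data   = simde_mm256_set1_epi8(7);
 *   simde__m256i needle = simde_mm256_set1_epi8(7);
 *   simde__m256i eq     = simde_mm256_cmpeq_epi8(data, needle);
 *   int32_t mask = simde_mm256_movemask_epi8(eq);
 *   // every byte compared equal, so all 32 bits of mask are set (mask == -1).
 */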
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
const int a_offset1 = imm8 & 4;
|
|
const int b_offset1 = (imm8 & 3) << 2;
|
|
const int a_offset2 = (imm8 >> 3) & 4;
|
|
const int b_offset2 = ((imm8 >> 3) & 3) << 2;
|
|
|
|
#if defined(simde_math_abs)
|
|
const int halfway_point = HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0])) ) / 2;
|
|
for (int i = 0 ; i < halfway_point ; i++) {
|
|
r_.u16[i] =
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 0] - b_.u8[b_offset1 + 0]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 1] - b_.u8[b_offset1 + 1]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 2] - b_.u8[b_offset1 + 2]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 3] - b_.u8[b_offset1 + 3])));
|
|
r_.u16[halfway_point + i] =
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 0] - b_.u8[2 * halfway_point + b_offset2 + 0]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 1] - b_.u8[2 * halfway_point + b_offset2 + 1]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 2] - b_.u8[2 * halfway_point + b_offset2 + 2]))) +
|
|
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 3] - b_.u8[2 * halfway_point + b_offset2 + 3])));
|
|
}
|
|
#else
|
|
HEDLEY_UNREACHABLE();
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0)
|
|
#define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
#define simde_mm256_mpsadbw_epu8(a, b, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \
|
|
simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mpsadbw_epu8
|
|
#define _mm256_mpsadbw_epu8(a, b, imm8) simde_mm256_mpsadbw_epu8(a, b, imm8)
|
|
#endif
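
/* Worked example of the imm8 decoding above (illustrative only; the value
 * 0x15 is arbitrary): for imm8 == 0x15 (binary 010101) the low 128-bit
 * lane uses a_offset1 = imm8 & 4 = 4 and b_offset1 = (imm8 & 3) << 2 = 4,
 * while the high lane uses the next three bits, giving a_offset2 = 0 and
 * b_offset2 = 8. Each of the eight 16-bit results per lane is then the sum
 * of four absolute byte differences taken from a sliding window of a_. */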
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mul_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] =
|
|
HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
|
|
HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
# define _mm256_mul_epi32(a, b) simde_mm256_mul_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mul_epu32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
# define _mm256_mul_epu32(a, b) simde_mm256_mul_epu32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mulhi_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
# define _mm256_mulhi_epi16(a, b) simde_mm256_mulhi_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mulhi_epu16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
|
|
r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
# define _mm256_mulhi_epu16(a, b) simde_mm256_mulhi_epu16(a, b)
|
|
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_mulhrs_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_mulhrs_epi16(a, b) simde_mm256_mulhrs_epi16(a, b)
#endif
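
/* Illustrative sketch (not part of the API; uses set1 helpers from avx.h
 * and arbitrary example values): mulhrs_epi16 is the usual Q15 fixed-point
 * multiply, rounding the 32-bit product by adding 0x4000 before the
 * right shift by 15.
 *
 *   simde__m256i a = simde_mm256_set1_epi16(16384);   // 0.5 in Q15
 *   simde__m256i b = simde_mm256_set1_epi16(8192);    // 0.25 in Q15
 *   simde__m256i r = simde_mm256_mulhrs_epi16(a, b);
 *   // every lane of r is 4096, i.e. 0.125 in Q15.
 */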
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mullo_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b),
|
|
r_;
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]);
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mullo_epi16
|
|
#define _mm256_mullo_epi16(a, b) simde_mm256_mullo_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_mullo_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b),
|
|
r_;
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]);
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_mullo_epi32
|
|
#define _mm256_mullo_epi32(a, b) simde_mm256_mullo_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_x_mm256_mullo_epu32 (simde__m256i a, simde__m256i b) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.u32 = a_.u32 * b_.u32;
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
r_.u32[i] = a_.u32[i] * b_.u32[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_or_si256 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_or_si256(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
|
|
r_.i32f = a_.i32f | b_.i32f;
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
|
|
r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_or_si256
|
|
#define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_packs_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/2;
|
|
const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/4;
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < quarter_point ; i++) {
|
|
r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
|
|
r_.i8[i + quarter_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
|
|
r_.i8[halfway_point + i] = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i]));
|
|
r_.i8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i]));
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_packs_epi16
|
|
#define _mm256_packs_epi16(a, b) simde_mm256_packs_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_packs_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
v_[] = {
|
|
simde__m256i_to_private(a),
|
|
simde__m256i_to_private(b)
|
|
};
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]);
|
|
r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)];
|
|
r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v));
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_packs_epi32
|
|
#define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_packus_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2;
|
|
const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4;
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < quarter_point ; i++) {
|
|
r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
|
|
r_.u8[i + quarter_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
|
|
r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i]));
|
|
r_.u8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i]));
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_packus_epi16
|
|
#define _mm256_packus_epi16(a, b) simde_mm256_packus_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_packus_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#else
|
|
const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
|
|
const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < quarter_point ; i++) {
|
|
r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
|
|
r_.u16[i + quarter_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
|
|
r_.u16[halfway_point + i] = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i]));
|
|
r_.u16[halfway_point + i + quarter_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + i]));
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_packus_epi32
|
|
#define _mm256_packus_epi32(a, b) simde_mm256_packus_epi32(a, b)
|
|
#endif
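
/* Illustrative sketch (not part of the API; uses set1 helpers from avx.h
 * and arbitrary example values): the pack* functions above saturate each
 * input element to the narrower type and interleave the two sources per
 * 128-bit lane (a-low, b-low, a-high, b-high).
 *
 *   simde__m256i a = simde_mm256_set1_epi32(100000);   // above UINT16_MAX
 *   simde__m256i b = simde_mm256_set1_epi32(-7);       // below zero
 *   simde__m256i r = simde_mm256_packus_epi32(a, b);
 *   // the elements that came from a saturate to 65535, those from b to 0.
 */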

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8     ) & 1] : a_.m128i_private[(imm8     ) & 1]);
  r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permute2x128_si256
  #define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8)
#endif
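
/* Worked example of the imm8 decoding above (illustrative only; 0x21 is an
 * arbitrary value): each 128-bit half of the result is selected by a 4-bit
 * field. For imm8 == 0x21 the low half takes field 0x1 (the high 128 bits
 * of a) and the high half takes field 0x2 (the low 128 bits of b); setting
 * bit 3 or bit 7 zeroes the corresponding half instead, as the 0x08/0x80
 * tests above show. */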
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1];
|
|
r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1];
|
|
r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1];
|
|
r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1];
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_permute4x64_epi64
|
|
#define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256d
|
|
simde_mm256_permute4x64_pd (simde__m256d a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256d_private
|
|
r_,
|
|
a_ = simde__m256d_to_private(a);
|
|
|
|
r_.f64[0] = (imm8 & 0x02) ? a_.f64[((imm8 ) & 1)+2] : a_.f64[(imm8 ) & 1];
|
|
r_.f64[1] = (imm8 & 0x08) ? a_.f64[((imm8 >> 2 ) & 1)+2] : a_.f64[(imm8 >> 2 ) & 1];
|
|
r_.f64[2] = (imm8 & 0x20) ? a_.f64[((imm8 >> 4 ) & 1)+2] : a_.f64[(imm8 >> 4 ) & 1];
|
|
r_.f64[3] = (imm8 & 0x80) ? a_.f64[((imm8 >> 6 ) & 1)+2] : a_.f64[(imm8 >> 6 ) & 1];
|
|
|
|
return simde__m256d_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_permute4x64_pd
|
|
#define _mm256_permute4x64_pd(a, imm8) simde_mm256_permute4x64_pd(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_permutevar8x32_epi32(a, idx);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
idx_ = simde__m256i_to_private(idx);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[idx_.i32[i] & 7];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_permutevar8x32_epi32
|
|
#define _mm256_permutevar8x32_epi32(a, idx) simde_mm256_permutevar8x32_epi32(a, idx)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256
|
|
simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
|
|
return _mm256_permutevar8x32_ps(a, HEDLEY_REINTERPRET_CAST(simde__m256, idx));
|
|
#else
|
|
return _mm256_permutevar8x32_ps(a, idx);
|
|
#endif
|
|
#else
|
|
simde__m256_private
|
|
r_,
|
|
a_ = simde__m256_to_private(a);
|
|
simde__m256i_private
|
|
idx_ = simde__m256i_to_private(idx);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
|
|
r_.f32[i] = a_.f32[idx_.i32[i] & 7];
|
|
}
|
|
|
|
return simde__m256_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_permutevar8x32_ps
|
|
#define _mm256_permutevar8x32_ps(a, idx) simde_mm256_permutevar8x32_ps(a, idx)
|
|
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sad_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        uint16_t tmp = 0;
        SIMDE_VECTORIZE_REDUCTION(+:tmp)
        for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 4) ; j++) {
          const size_t e = j + (i * 8);
          tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
        }
        r_.i64[i] = tmp;
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sad_epu8
  #define _mm256_sad_epu8(a, b) simde_mm256_sad_epu8(a, b)
#endif
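
/* Illustrative sketch (not part of the API; uses set1 helpers from avx.h
 * and arbitrary example values): sad_epu8 sums the absolute byte
 * differences of each group of eight bytes into one 64-bit lane, which is
 * handy for block-matching metrics.
 *
 *   simde__m256i a = simde_mm256_set1_epi8(10);
 *   simde__m256i b = simde_mm256_set1_epi8(3);
 *   simde__m256i r = simde_mm256_sad_epu8(a, b);
 *   // each of the four 64-bit lanes of r holds 8 * |10 - 3| == 56.
 */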

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_shuffle_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) {
        r_.u8[ i     ] = (b_.u8[ i     ] & 0x80) ? 0 : a_.u8[(b_.u8[ i     ] & 0x0f)     ];
        r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_shuffle_epi8
  #define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b)
#endif
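
/* Illustrative sketch (not part of the API; uses set1/setzero helpers from
 * avx.h and arbitrary example data): shuffle_epi8 works per 128-bit lane.
 * Each control byte selects one of the 16 bytes of the same lane of a, and
 * a control byte with its high bit set produces 0.
 *
 *   simde__m256i a    = simde_mm256_set1_epi32(0x04030201);
 *   simde__m256i ctrl = simde_mm256_setzero_si256();   // every index is 0
 *   simde__m256i r    = simde_mm256_shuffle_epi8(a, ctrl);
 *   // every byte of r is 0x01: byte 0 of each lane, broadcast lane-wise.
 */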
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
|
|
r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
|
|
}
|
|
for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
|
|
r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
|
|
# define simde_mm256_shuffle_epi32(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#elif defined(SIMDE_SHUFFLE_VECTOR_)
|
|
# define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
|
|
const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
|
|
simde__m256i_from_private((simde__m256i_private) { .i32 = \
|
|
SIMDE_SHUFFLE_VECTOR_(32, 32, \
|
|
(simde__tmp_a_).i32, \
|
|
(simde__tmp_a_).i32, \
|
|
((imm8) ) & 3, \
|
|
((imm8) >> 2) & 3, \
|
|
((imm8) >> 4) & 3, \
|
|
((imm8) >> 6) & 3, \
|
|
(((imm8) ) & 3) + 4, \
|
|
(((imm8) >> 2) & 3) + 4, \
|
|
(((imm8) >> 4) & 3) + 4, \
|
|
(((imm8) >> 6) & 3) + 4) }); }))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_shuffle_epi32
|
|
#define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8)
|
|
#endif
|
|
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_shufflehi_epi16(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#elif defined(SIMDE_SHUFFLE_VECTOR_)
|
|
# define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \
|
|
const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
|
|
simde__m256i_from_private((simde__m256i_private) { .i16 = \
|
|
SIMDE_SHUFFLE_VECTOR_(16, 32, \
|
|
(simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, \
|
|
0, 1, 2, 3, \
|
|
(((imm8) ) & 3) + 4, \
|
|
(((imm8) >> 2) & 3) + 4, \
|
|
(((imm8) >> 4) & 3) + 4, \
|
|
(((imm8) >> 6) & 3) + 4, \
|
|
8, 9, 10, 11, \
|
|
((((imm8) ) & 3) + 8 + 4), \
|
|
((((imm8) >> 2) & 3) + 8 + 4), \
|
|
((((imm8) >> 4) & 3) + 8 + 4), \
|
|
((((imm8) >> 6) & 3) + 8 + 4) \
|
|
) }); }))
|
|
#else
|
|
# define simde_mm256_shufflehi_epi16(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
|
|
simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_shufflehi_epi16
|
|
#define _mm256_shufflehi_epi16(a, imm8) simde_mm256_shufflehi_epi16(a, imm8)
|
|
#endif
|
|
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_shufflelo_epi16(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#elif defined(SIMDE_SHUFFLE_VECTOR_)
|
|
# define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
|
|
const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
|
|
simde__m256i_from_private((simde__m256i_private) { .i16 = \
|
|
SIMDE_SHUFFLE_VECTOR_(16, 32, \
|
|
(simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, \
|
|
(((imm8) ) & 3), \
|
|
(((imm8) >> 2) & 3), \
|
|
(((imm8) >> 4) & 3), \
|
|
(((imm8) >> 6) & 3), \
|
|
4, 5, 6, 7, \
|
|
((((imm8) ) & 3) + 8), \
|
|
((((imm8) >> 2) & 3) + 8), \
|
|
((((imm8) >> 4) & 3) + 8), \
|
|
((((imm8) >> 6) & 3) + 8), \
|
|
12, 13, 14, 15) }); }))
|
|
#else
|
|
# define simde_mm256_shufflelo_epi16(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
|
|
simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_shufflelo_epi16
|
|
#define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sign_epi8(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
|
|
r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sign_epi8
|
|
#define _mm256_sign_epi8(a, b) simde_mm256_sign_epi8(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sign_epi16(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sign_epi16
|
|
#define _mm256_sign_epi16(a, b) simde_mm256_sign_epi16(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sign_epi32(a, b);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b);
|
|
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
|
|
r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sign_epi32
|
|
#define _mm256_sign_epi32(a, b) simde_mm256_sign_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sll_epi16(a, count);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count);
|
|
r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count);
|
|
#else
|
|
simde__m128i_private
|
|
count_ = simde__m128i_to_private(count);
|
|
|
|
uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
|
|
if (shift > 15)
|
|
return simde_mm256_setzero_si256();
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift));
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sll_epi16
|
|
#define _mm256_sll_epi16(a, count) simde_mm256_sll_epi16(a, count)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sll_epi32(a, count);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count);
|
|
r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count);
|
|
#else
|
|
simde__m128i_private
|
|
count_ = simde__m128i_to_private(count);
|
|
|
|
uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
|
|
if (shift > 31)
|
|
return simde_mm256_setzero_si256();
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift));
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sll_epi32
|
|
#define _mm256_sll_epi32(a, count) simde_mm256_sll_epi32(a, count)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sll_epi64(a, count);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count);
|
|
r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count);
|
|
#else
|
|
simde__m128i_private
|
|
count_ = simde__m128i_to_private(count);
|
|
|
|
uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
|
|
if (shift > 63)
|
|
return simde_mm256_setzero_si256();
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift));
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sll_epi64
|
|
#define _mm256_sll_epi64(a, count) simde_mm256_sll_epi64(a, count)
|
|
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  /* Note: There is no consistency in how compilers handle values outside of
     the expected range, hence the discrepancy between what we allow and what
     Intel specifies. Some compilers will return 0, others seem to just mask
     off everything outside of the range. */
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
    for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) {
      r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv);
    }
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff));
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi16(a, imm8) \
    simde_mm256_set_m128i( \
        simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
        simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_slli_epi16
  #define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
|
|
SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
|
|
for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) {
|
|
r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv);
|
|
}
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_slli_epi32(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_slli_epi32
|
|
#define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
|
|
r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_slli_epi64(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_slli_epi64
|
|
#define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_slli_si256 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
|
|
const int e = HEDLEY_STATIC_CAST(int, i) - imm8;
|
|
r_.m128i_private[h].i8[i] = (e >= 0) ? a_.m128i_private[h].i8[e] : 0;
|
|
}
|
|
}
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
|
|
# define simde_mm256_slli_si256(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
# define simde_mm256_slli_si256(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_slli_si256
|
|
#define _mm256_slli_si256(a, imm8) simde_mm256_slli_si256(a, imm8)
|
|
#endif
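
/* Illustrative sketch (not part of the API; uses set1 helpers from avx.h
 * and arbitrary example values): the immediate shifts above shift each
 * element, while slli_si256 shifts each 128-bit lane bytewise; bits (or
 * bytes) pushed past the element or lane width are simply lost.
 *
 *   simde__m256i v  = simde_mm256_set1_epi32(1);
 *   simde__m256i r1 = simde_mm256_slli_epi32(v, 4);   // every 32-bit lane is 16
 *   simde__m256i r2 = simde_mm256_slli_si256(v, 4);   // bytes move up by 4
 *                                                     // within each 128-bit lane
 */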
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) {
|
|
simde__m128i_private
|
|
a_ = simde__m128i_to_private(a),
|
|
b_ = simde__m128i_to_private(b),
|
|
r_;
|
|
|
|
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
|
|
r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32));
|
|
r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32)));
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < UINT32_C(32))) & (a_.u32 << b_.u32);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_sllv_epi32(a, b) _mm_sllv_epi32(a, b)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_sllv_epi32
|
|
#define _mm_sllv_epi32(a, b) simde_mm_sllv_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) {
|
|
simde__m256i_private
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b),
|
|
r_;
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 << b_.u32);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
|
|
r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_sllv_epi32(a, b) _mm256_sllv_epi32(a, b)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sllv_epi32
|
|
#define _mm256_sllv_epi32(a, b) simde_mm256_sllv_epi32(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m128i
|
|
simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) {
|
|
simde__m128i_private
|
|
a_ = simde__m128i_to_private(a),
|
|
b_ = simde__m128i_to_private(b),
|
|
r_;
|
|
|
|
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
|
|
r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64));
|
|
r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64)));
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
|
|
}
|
|
#endif
|
|
|
|
return simde__m128i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm_sllv_epi64(a, b) _mm_sllv_epi64(a, b)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm_sllv_epi64
|
|
#define _mm_sllv_epi64(a, b) simde_mm_sllv_epi64(a, b)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) {
|
|
simde__m256i_private
|
|
a_ = simde__m256i_to_private(a),
|
|
b_ = simde__m256i_to_private(b),
|
|
r_;
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]);
|
|
r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]);
|
|
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
|
|
r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
#define simde_mm256_sllv_epi64(a, b) _mm256_sllv_epi64(a, b)
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sllv_epi64
|
|
#define _mm256_sllv_epi64(a, b) simde_mm256_sllv_epi64(a, b)
|
|
#endif
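
/* Illustrative sketch (not part of the API; uses set/set1 helpers from
 * avx.h and arbitrary example values): the sllv* forms take a per-element
 * shift count, and any count >= the element width yields 0 rather than
 * being masked.
 *
 *   simde__m256i v = simde_mm256_set1_epi32(1);
 *   simde__m256i c = simde_mm256_set_epi32(40, 31, 8, 4, 3, 2, 1, 0);
 *   simde__m256i r = simde_mm256_sllv_epi32(v, c);
 *   // r, from lane 0 upward, is 1, 2, 4, 8, 16, 256, INT32_MIN, 0:
 *   // the count of 40 is out of range, so that lane becomes 0.
 */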
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sra_epi16(a, count);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count);
|
|
r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count);
|
|
#else
|
|
simde__m128i_private
|
|
count_ = simde__m128i_to_private(count);
|
|
|
|
uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
|
|
|
|
if (shift > 15) shift = 15;
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.i16[i] >> shift;
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sra_epi16
|
|
#define _mm256_sra_epi16(a, count) simde_mm256_sra_epi16(a, count)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) {
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
return _mm256_sra_epi32(a, count);
|
|
#else
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
|
|
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count);
|
|
r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count);
|
|
#else
|
|
simde__m128i_private
|
|
count_ = simde__m128i_to_private(count);
|
|
uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
|
|
|
|
if (shift > 31) shift = 31;
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[i] >> shift;
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
#endif
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_sra_epi32
|
|
#define _mm256_sra_epi32(a, count) simde_mm256_sra_epi32(a, count)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
|
|
|
|
if (shift > 15) shift = 15;
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
|
|
r_.i16[i] = a_.i16[i] >> shift;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_srai_epi16(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_srai_epi16
|
|
#define _mm256_srai_epi16(a, imm8) simde_mm256_srai_epi16(a, imm8)
|
|
#endif
|
|
|
|
SIMDE_FUNCTION_ATTRIBUTES
|
|
simde__m256i
|
|
simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
|
|
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
|
|
simde__m256i_private
|
|
r_,
|
|
a_ = simde__m256i_to_private(a);
|
|
unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
|
|
|
|
if (shift > 31) shift = 31;
|
|
|
|
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
|
|
r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
|
|
#else
|
|
SIMDE_VECTORIZE
|
|
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
|
|
r_.i32[i] = a_.i32[i] >> shift;
|
|
}
|
|
#endif
|
|
|
|
return simde__m256i_from_private(r_);
|
|
}
|
|
#if defined(SIMDE_X86_AVX2_NATIVE)
|
|
# define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
|
|
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
|
|
# define simde_mm256_srai_epi32(a, imm8) \
|
|
simde_mm256_set_m128i( \
|
|
simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
|
|
simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
|
|
#endif
|
|
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
|
|
#undef _mm256_srai_epi32
|
|
#define _mm256_srai_epi32(a, imm8) simde_mm256_srai_epi32(a, imm8)
|
|
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_srav_epi32(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int32x4_t cnt = vreinterpretq_s32_u32(vminq_u32(count_.neon_u32, vdupq_n_u32(31)));
      r_.neon_i32 = vshlq_s32(a_.neon_i32, vnegq_s32(cnt));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
        r_.i32[i] = a_.i32[i] >> HEDLEY_STATIC_CAST(int, shift > 31 ? 31 : shift);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_srav_epi32
  #define _mm_srav_epi32(a, count) simde_mm_srav_epi32(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_srav_epi32(a, count);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      count_ = simde__m256i_to_private(count);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]);
      r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
        if (shift > 31) shift = 31;
        r_.i32[i] = a_.i32[i] >> shift;
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srav_epi32
  #define _mm256_srav_epi32(a, count) simde_mm256_srav_epi32(a, count)
#endif
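
/* Editor's note (not part of the upstream header): the variable arithmetic
 * shifts (srav) clamp each per-lane count to 31, so lanes with an
 * out-of-range count end up as 0 or -1 (sign fill), whereas the variable
 * logical shifts further below (srlv) zero any lane whose count is at least
 * the element width.  Hypothetical example:
 *
 *   simde__m128i v = simde_mm_set1_epi32(-16);
 *   simde__m128i c = simde_mm_set_epi32(64, 4, 1, 0);
 *   simde__m128i r = simde_mm_srav_epi32(v, c);
 *   // i32 lanes, low to high: -16, -8, -1, -1
 */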

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_srl_epi16(a, count);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count);
      r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count);
    #else
      simde__m128i_private
        count_ = simde__m128i_to_private(count);

      uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, (count_.i64[0] > 16 ? 16 : count_.i64[0]));

      #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
        r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
          r_.u16[i] = a_.u16[i] >> (shift);
        }
      #endif
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srl_epi16
  #define _mm256_srl_epi16(a, count) simde_mm256_srl_epi16(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_srl_epi32(a, count);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count);
      r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count);
    #else
      simde__m128i_private
        count_ = simde__m128i_to_private(count);

      uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, (count_.i64[0] > 32 ? 32 : count_.i64[0]));

      #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
        r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
          r_.u32[i] = a_.u32[i] >> (shift);
        }
      #endif
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srl_epi32
  #define _mm256_srl_epi32(a, count) simde_mm256_srl_epi32(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_srl_epi64(a, count);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count);
      r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count);
    #else
      simde__m128i_private
        count_ = simde__m128i_to_private(count);

      uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, (count_.i64[0] > 64 ? 64 : count_.i64[0]));

      #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
        r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
          r_.u64[i] = a_.u64[i] >> (shift);
        }
      #endif
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srl_epi64
  #define _mm256_srl_epi64(a, count) simde_mm256_srl_epi64(a, count)
#endif
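
/* Editor's note (not part of the upstream header): the srl forms above read
 * a single shift count from the low 64 bits of the __m128i `count` argument
 * and apply it to every lane; a count of at least the element width yields
 * zero in every lane.  Illustrative sketch (x is a hypothetical vector):
 *
 *   simde__m128i c = simde_mm_cvtsi32_si128(3);
 *   simde__m256i r = simde_mm256_srl_epi32(x, c); // each u32 lane of x >> 3
 */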

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a);

  if (imm8 > 15)
    return simde_mm256_setzero_si256();

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
    for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
      r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv);
    }
  #else
    if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) {
      simde_memset(&r_, 0, sizeof(r_));
    } else {
      #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
        r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
          r_.u16[i] = a_.u16[i] >> imm8;
        }
      #endif
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
#  define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#  define simde_mm256_srli_epi16(a, imm8) \
     simde_mm256_set_m128i( \
         simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
         simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srli_epi16
  #define _mm256_srli_epi16(a, imm8) simde_mm256_srli_epi16(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
    for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) {
      r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv);
    }
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] >> imm8;
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
#  define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#  define simde_mm256_srli_epi32(a, imm8) \
     simde_mm256_set_m128i( \
         simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
         simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srli_epi32
  #define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = a_.u64[i] >> imm8;
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
#  define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#  define simde_mm256_srli_epi64(a, imm8) \
     simde_mm256_set_m128i( \
         simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
         simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srli_epi64
  #define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
#endif
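
/* Editor's note (not part of the upstream header): for the immediate logical
 * shifts (srli), an out-of-range immediate produces zero on real hardware;
 * the epi16 form above makes this explicit with an early return for
 * imm8 > 15, while the portable epi32/epi64 paths assume the immediate is
 * within the element width. */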

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_si256 (simde__m256i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a);

  for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
      const int e = imm8 + HEDLEY_STATIC_CAST(int, i);
      r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0;
    }
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
#  define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI)
#  define simde_mm256_srli_si256(a, imm8) \
     simde_mm256_set_m128i( \
         simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
         simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#  define simde_mm256_srli_si256(a, imm8) \
     simde_mm256_set_m128i( \
         simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
         simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srli_si256
  #define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8)
#endif
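
/* Editor's note (not part of the upstream header): _mm256_srli_si256 shifts
 * each 128-bit lane right by imm8 *bytes* independently; bytes never move
 * across the boundary between the two lanes.  Illustrative sketch with a
 * hypothetical vector x whose bytes are b0..b31 (b0 lowest):
 *
 *   simde__m256i r = simde_mm256_srli_si256(x, 4);
 *   // low lane:  b4..b15 followed by four zero bytes
 *   // high lane: b20..b31 followed by four zero bytes
 */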

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    r_;

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
  #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_srlv_epi32
  #define _mm_srlv_epi32(a, b) simde_mm_srlv_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) {
  simde__m256i_private
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b),
    r_;

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
  #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srlv_epi32
  #define _mm256_srlv_epi32(a, b) simde_mm256_srlv_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    r_;

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
  #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_srlv_epi64
  #define _mm_srlv_epi64(a, b) simde_mm_srlv_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) {
  simde__m256i_private
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b),
    r_;

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
    }
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
  #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_srlv_epi64
  #define _mm256_srlv_epi64(a, b) simde_mm256_srlv_epi64(a, b)
#endif
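
/* Editor's note (not part of the upstream header): the srlv forms take an
 * independent shift count per lane and, unlike srav, produce 0 for any lane
 * whose count is at least the element width; the vector path expresses this
 * as a mask, (b < 32) & (a >> b).  Hypothetical example:
 *
 *   simde__m128i v = simde_mm_set1_epi32(-1);          // all bits set
 *   simde__m128i c = simde_mm_set_epi32(32, 31, 4, 0);
 *   simde__m128i r = simde_mm_srlv_epi32(v, c);
 *   // u32 lanes, low to high: 0xFFFFFFFF, 0x0FFFFFFF, 1, 0
 */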

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr));
  #else
    simde__m256i r;
    simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
    return r;
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#  define _mm256_stream_load_si256(mem_addr) simde_mm256_stream_load_si256(mem_addr)
#endif
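
/* Editor's note (not part of the upstream header): without native AVX2 the
 * "stream" load above degrades to an ordinary aligned load via simde_memcpy.
 * That is still correct, since the non-temporal behaviour of MOVNTDQA is
 * only a cache-usage hint; the fallback simply loses the hint. */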

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 - b_.i8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] - b_.i8[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi8
  #define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 - b_.i16;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] - b_.i16[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi16
  #define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hsub_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hsub_epi16(a, b);
  #else
    return simde_mm256_sub_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hsub_epi16
  #define _mm256_hsub_epi16(a, b) simde_mm256_hsub_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 - b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] - b_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi32
  #define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hsub_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hsub_epi32(a, b);
  #else
    return simde_mm256_sub_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hsub_epi32
  #define _mm256_hsub_epi32(a, b) simde_mm256_hsub_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] - b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi64
  #define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 - b_.u32;
  #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
    r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]);
    r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] - b_.u32[i];
    }
  #endif

  return simde__m256i_from_private(r_);
}
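
/* Editor's note (not part of the upstream header): simde_x_mm256_sub_epu32
 * and the simde_x_mm256_deinterleave* helpers used by the hsub fallbacks
 * above carry the simde_x_ prefix because they are SIMDE-internal extensions
 * with no corresponding Intel intrinsic, which is why no native alias is
 * defined for them.  The hsub fallback relies on the identity that
 * subtracting the deinterleaved odd elements from the even elements is the
 * same as subtracting adjacent pairs. */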

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epi8
  #define _mm256_subs_epi8(a, b) simde_mm256_subs_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epi16
  #define _mm256_subs_epi16(a, b) simde_mm256_subs_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hsubs_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hsubs_epi16(a, b);
  #else
    return simde_mm256_subs_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hsubs_epi16
  #define _mm256_hsubs_epi16(a, b) simde_mm256_hsubs_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epu8
  #define _mm256_subs_epu8(a, b) simde_mm256_subs_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epu16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epu16
  #define _mm256_subs_epu16(a, b) simde_mm256_subs_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_x_mm256_test_all_ones (simde__m256i a) {
  simde__m256i_private a_ = simde__m256i_to_private(a);
  int r;
  int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);

  SIMDE_VECTORIZE_REDUCTION(&:r_)
  for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
    r_ &= a_.i32f[i];
  }

  r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));

  return r;
}
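
/* Editor's note (not part of the upstream header): the subs (saturating
 * subtract) forms clamp to the representable range instead of wrapping, so
 * INT8_MIN - 1 stays INT8_MIN and, for the unsigned forms, 0 - 1 stays 0.
 * simde_x_mm256_test_all_ones above is another SIMDE-only helper: it returns
 * 1 only when every bit of its argument is set.  Hypothetical example:
 *
 *   simde__m256i z = simde_mm256_subs_epu8(simde_mm256_setzero_si256(),
 *                                          simde_mm256_set1_epi8(1));
 *   // every u8 lane of z is 0 (saturated), not 255
 */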

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
           0, 32,  1, 33,  2, 34,  3, 35,
           4, 36,  5, 37,  6, 38,  7, 39,
          16, 48, 17, 49, 18, 50, 19, 51,
          20, 52, 21, 53, 22, 54, 23, 55);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) {
        r_.i8[2 * i] = a_.i8[i + ~(~i | 7)];
        r_.i8[2 * i + 1] = b_.i8[i + ~(~i | 7)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi8
  #define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
          0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) {
        r_.i16[2 * i] = a_.i16[i + ~(~i | 3)];
        r_.i16[2 * i + 1] = b_.i16[i + ~(~i | 3)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi16
  #define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
          0, 8, 1, 9, 4, 12, 5, 13);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) {
        r_.i32[2 * i] = a_.i32[i + ~(~i | 1)];
        r_.i32[2 * i + 1] = b_.i32[i + ~(~i | 1)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi32
  #define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 0, 4, 2, 6);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) {
        r_.i64[2 * i] = a_.i64[2 * i];
        r_.i64[2 * i + 1] = b_.i64[2 * i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi64
  #define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
           8, 40,  9, 41, 10, 42, 11, 43,
          12, 44, 13, 45, 14, 46, 15, 47,
          24, 56, 25, 57, 26, 58, 27, 59,
          28, 60, 29, 61, 30, 62, 31, 63);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) {
        r_.i8[2 * i] = a_.i8[i + 8 + ~(~i | 7)];
        r_.i8[2 * i + 1] = b_.i8[i + 8 + ~(~i | 7)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi8
  #define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
           4, 20,  5, 21,  6, 22,  7, 23,
          12, 28, 13, 29, 14, 30, 15, 31);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) {
        r_.i16[2 * i] = a_.i16[i + 4 + ~(~i | 3)];
        r_.i16[2 * i + 1] = b_.i16[i + 4 + ~(~i | 3)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi16
  #define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
          2, 10, 3, 11, 6, 14, 7, 15);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) {
        r_.i32[2 * i] = a_.i32[i + 2 + ~(~i | 1)];
        r_.i32[2 * i + 1] = b_.i32[i + 2 + ~(~i | 1)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi32
  #define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 1, 5, 3, 7);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) {
        r_.i64[2 * i] = a_.i64[2 * i + 1];
        r_.i64[2 * i + 1] = b_.i64[2 * i + 1];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi64
  #define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b)
#endif
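
/* Editor's note (not part of the upstream header): as with the 128-bit-lane
 * byte shift above, the 256-bit unpack operations interleave within each
 * 128-bit lane rather than across the whole register; unpacklo takes the low
 * half of each lane and unpackhi the high half.  Hypothetical example:
 *
 *   simde__m256i a = simde_mm256_set_epi64x(3, 2, 1, 0);
 *   simde__m256i b = simde_mm256_set_epi64x(7, 6, 5, 4);
 *   // simde_mm256_unpacklo_epi64(a, b) -> i64 lanes, low to high: 0, 4, 2, 6
 *   // simde_mm256_unpackhi_epi64(a, b) -> i64 lanes, low to high: 1, 5, 3, 7
 */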

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_xor_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] ^ b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_xor_si256
  #define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_AVX2_H) */