/* stk-code_catmod/lib/simd_wrapper/simde/x86/avx512/round.h */

#if !defined(SIMDE_X86_AVX512_ROUND_H)
#define SIMDE_X86_AVX512_ROUND_H
#include "types.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
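/* Internal (simde_x_-prefixed) helpers that round every lane of a 512-bit
 * vector with a compile-time rounding mode.  When SIMDE_NATURAL_VECTOR_SIZE_LE(256)
 * holds and statement expressions are available, the input is split into two
 * 256-bit halves and forwarded to the simde_mm256_round_ps/_pd helpers; the
 * macro form lets `rounding` be passed through as a compile-time constant. */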
#if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_)
#define simde_x_mm512_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \
simde__m512_private \
simde_x_mm512_round_ps_r_, \
simde_x_mm512_round_ps_a_ = simde__m512_to_private(a); \
\
for (size_t simde_x_mm512_round_ps_i = 0 ; simde_x_mm512_round_ps_i < (sizeof(simde_x_mm512_round_ps_r_.m256) / sizeof(simde_x_mm512_round_ps_r_.m256[0])) ; simde_x_mm512_round_ps_i++) { \
simde_x_mm512_round_ps_r_.m256[simde_x_mm512_round_ps_i] = simde_mm256_round_ps(simde_x_mm512_round_ps_a_.m256[simde_x_mm512_round_ps_i], rounding); \
} \
\
simde__m512_from_private(simde_x_mm512_round_ps_r_); \
}))
#else
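/* Fallback for targets without statement expressions, or where
 * SIMDE_NATURAL_VECTOR_SIZE_LE(256) does not hold: an out-of-line helper that
 * dispatches on the rounding mode and rounds per 128-bit chunk or per element. */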
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_x_mm512_round_ps (simde__m512 a, int rounding)
SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
simde__m512_private
r_,
a_ = simde__m512_to_private(a);
/* For architectures which lack a current direction SIMD instruction.
*
* Note that NEON actually has a current rounding mode instruction,
* but in ARMv8+ the rounding mode is ignored and nearest is always
* used, so we treat ARMv7 as having a rounding mode but ARMv8 as
* not. */
#if \
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
defined(SIMDE_ARM_NEON_A32V8)
/* SIMDE_MM_GET_ROUNDING_MODE() reports the mode in the MXCSR position
 * (0x0000/0x2000/0x4000/0x6000), so shift right by 13 to obtain the
 * corresponding SIMDE_MM_FROUND_* base mode (0-3). */
if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) >> 13;
#endif
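/* SIMDE_MM_FROUND_NO_EXC only suppresses floating-point exception signalling
 * on real hardware; none of the fallbacks below model FP exceptions, so the
 * bit is masked off before dispatching on the base rounding mode. */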
switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
case SIMDE_MM_FROUND_CUR_DIRECTION:
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.m128_private[i].altivec_f32));
}
#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].neon_f32 = vrndiq_f32(a_.m128_private[i].neon_f32);
}
#elif defined(simde_math_nearbyintf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
#endif
break;
case SIMDE_MM_FROUND_TO_NEAREST_INT:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.m128_private[i].altivec_f32));
}
#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].neon_f32 = vrndnq_f32(a_.m128_private[i].neon_f32);
}
#elif defined(simde_math_roundevenf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_roundevenf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
#endif
break;
case SIMDE_MM_FROUND_TO_NEG_INF:
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.m128_private[i].altivec_f32));
}
#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].neon_f32 = vrndmq_f32(a_.m128_private[i].neon_f32);
}
#elif defined(simde_math_floorf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_floorf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
#endif
break;
case SIMDE_MM_FROUND_TO_POS_INF:
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.m128_private[i].altivec_f32));
}
#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].neon_f32 = vrndpq_f32(a_.m128_private[i].neon_f32);
}
#elif defined(simde_math_ceilf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_ceilf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
#endif
break;
case SIMDE_MM_FROUND_TO_ZERO:
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.m128_private[i].altivec_f32));
}
#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) {
r_.m128_private[i].neon_f32 = vrndq_f32(a_.m128_private[i].neon_f32);
}
#elif defined(simde_math_truncf)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_truncf(a_.f32[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
#endif
break;
default:
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps());
}
return simde__m512_from_private(r_);
}
#endif
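/* Double-precision variant: same structure as simde_x_mm512_round_ps, working
 * on simde__m256d halves or on f64 elements with the simde_math_* double
 * fallbacks. */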
#if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_)
#define simde_x_mm512_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \
simde__m512d_private \
simde_x_mm512_round_pd_r_, \
simde_x_mm512_round_pd_a_ = simde__m512d_to_private(a); \
\
for (size_t simde_x_mm512_round_pd_i = 0 ; simde_x_mm512_round_pd_i < (sizeof(simde_x_mm512_round_pd_r_.m256d) / sizeof(simde_x_mm512_round_pd_r_.m256d[0])) ; simde_x_mm512_round_pd_i++) { \
simde_x_mm512_round_pd_r_.m256d[simde_x_mm512_round_pd_i] = simde_mm256_round_pd(simde_x_mm512_round_pd_a_.m256d[simde_x_mm512_round_pd_i], rounding); \
} \
\
simde__m512d_from_private(simde_x_mm512_round_pd_r_); \
}))
#else
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_x_mm512_round_pd (simde__m512d a, int rounding)
SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
simde__m512d_private
r_,
a_ = simde__m512d_to_private(a);
/* For architectures which lack a current direction SIMD instruction. */
#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
/* As above: map the MXCSR-style mode (0x0000/0x2000/0x4000/0x6000) reported by
 * SIMDE_MM_GET_ROUNDING_MODE() to the SIMDE_MM_FROUND_* base mode (0-3). */
if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) >> 13;
#endif
switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
case SIMDE_MM_FROUND_CUR_DIRECTION:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.m128d_private[i].altivec_f64));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].neon_f64 = vrndiq_f64(a_.m128d_private[i].neon_f64);
}
#elif defined(simde_math_nearbyint)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
#endif
break;
case SIMDE_MM_FROUND_TO_NEAREST_INT:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.m128d_private[i].altivec_f64));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].neon_f64 = vrndnq_f64(a_.m128d_private[i].neon_f64); /* ties to even, matching roundeven() and the f32 path */
}
#elif defined(simde_math_roundeven)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_roundeven(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
#endif
break;
case SIMDE_MM_FROUND_TO_NEG_INF:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.m128d_private[i].altivec_f64));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].neon_f64 = vrndmq_f64(a_.m128d_private[i].neon_f64);
}
#elif defined(simde_math_floor)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_floor(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
#endif
break;
case SIMDE_MM_FROUND_TO_POS_INF:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.m128d_private[i].altivec_f64));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].neon_f64 = vrndpq_f64(a_.m128d_private[i].neon_f64);
}
#elif defined(simde_math_ceil)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_ceil(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
#endif
break;
case SIMDE_MM_FROUND_TO_ZERO:
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.m128d_private[i].altivec_f64));
}
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) {
r_.m128d_private[i].neon_f64 = vrndq_f64(a_.m128d_private[i].neon_f64);
}
#elif defined(simde_math_trunc)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_trunc(a_.f64[i]);
}
#else
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
#endif
break;
default:
HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd());
}
return simde__m512d_from_private(r_);
}
#endif
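/* Illustrative usage (not part of this header): assuming the usual SIMDe
 * AVX-512 helpers such as simde_mm512_set1_ps()/simde_mm512_set1_pd() are
 * included elsewhere,
 *
 *   simde__m512 v = simde_mm512_set1_ps(2.5f);
 *   simde__m512 r = simde_x_mm512_round_ps(v, SIMDE_MM_FROUND_TO_NEAREST_INT);
 *   // every lane of r is 2.0f: TO_NEAREST_INT rounds halfway cases to even
 *
 *   simde__m512d d = simde_mm512_set1_pd(-1.5);
 *   simde__m512d t = simde_x_mm512_round_pd(d, SIMDE_MM_FROUND_TO_ZERO);
 *   // every lane of t is -1.0: TO_ZERO truncates toward zero
 */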
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_AVX512_ROUND_H) */