stk-code_catmod/lib/graphics_utils/mipmap/cpusimd.h
2022-04-29 11:28:47 +08:00

383 lines
12 KiB
C

/* -----------------------------------------------------------------------------
*
* Copyright (c) 2008-2016 Alexis Naveros.
*
* The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
* See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
*
* Portions developed under contract to the SURVICE Engineering Company.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* -----------------------------------------------------------------------------
*/
#ifndef CPUSIMD_H
#define CPUSIMD_H
////
#include <simd_wrapper.h>
#if __SSE4A__ || CPU_ENABLE_SSE4A
#include <ammintrin.h>
#define CPU_SSE4A_SUPPORT (1)
#endif
#if __AVX__ || CPU_ENABLE_AVX
#include <immintrin.h>
#define CPU_AVX_SUPPORT (1)
#endif
#if __AVX2__ || CPU_ENABLE_AVX2
#include <immintrin.h>
#define CPU_AVX2_SUPPORT (1)
#endif
#if __XOP__ || CPU_ENABLE_XOP
#include <immintrin.h>
#define CPU_XOP_SUPPORT (1)
#endif
#if __FMA3__ || CPU_ENABLE_FMA3
#include <immintrin.h>
#define CPU_FMA3_SUPPORT (1)
#endif
#if __FMA4__ || CPU_ENABLE_FMA4
#include <immintrin.h>
#define CPU_FMA4_SUPPORT (1)
#endif
#if __RDRND__ || CPU_ENABLE_RDRND
#include <immintrin.h>
#define CPU_RDRND_SUPPORT (1)
#endif
#if __POPCNT__ || CPU_ENABLE_POPCNT
#include <popcntintrin.h>
#define CPU_POPCNT_SUPPORT (1)
#endif
#if __LZCNT__ || CPU_ENABLE_LZCNT
#include <x86intrin.h>
#define CPU_LZCNT_SUPPORT (1)
#endif
#if __F16C__ || CPU_ENABLE_F16C
#include <x86intrin.h>
#define CPU_F16C_SUPPORT (1)
#endif
#if __BMI__ || CPU_ENABLE_BMI
#include <x86intrin.h>
#define CPU_BMI_SUPPORT (1)
#endif
#if __BMI2__ || CPU_ENABLE_BMI2
#include <x86intrin.h>
#define CPU_BMI2_SUPPORT (1)
#endif
#if __TBM__ || CPU_ENABLE_TBM
#include <x86intrin.h>
#define CPU_TBM_SUPPORT (1)
#endif
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CPU_ALIGN16 __attribute__((aligned(16)))
#define CPU_ALIGN32 __attribute__((aligned(32)))
#define CPU_ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
#define CPU_ALIGN16 __declspec(align(16))
#define CPU_ALIGN64 __declspec(align(64))
#else
#define CPU_ALIGN16
#define CPU_ALIGN32
#define CPU_ALIGN64
#warning "SSE/AVX Disabled: Unsupported Compiler."
#undef CPU_SSE_SUPPORT
#undef CPU_SSE2_SUPPORT
#undef CPU_SSE3_SUPPORT
#undef CPU_SSSE3_SUPPORT
#undef CPU_SSE4_1_SUPPORT
#undef CPU_SSE4_2_SUPPORT
#undef CPU_AVX_SUPPORT
#undef CPU_AVX2_SUPPORT
#undef CPU_XOP_SUPPORT
#undef CPU_FMA3_SUPPORT
#undef CPU_FMA4_SUPPORT
#endif
////
#if CPU_SSE_SUPPORT
#define CPU_APPROX_DIV_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rcp_ss(_mm_set_ss(w))))
#define CPU_APPROX_SQRT_FLOAT(z) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(z))))
#define CPU_APPROX_RSQRT_FLOAT(z) _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(z)))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(w))))
#else
#define CPU_APPROX_DIV_FLOAT(z,w) ((z)/(w))
#define CPU_APPROX_SQRT_FLOAT(z) (sqrtf(z))
#define CPU_APPROX_RSQRT_FLOAT(z) (1.0/sqrtf(z))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) ((z)/sqrtf(w))
#endif
#if CPU_SSE3_SUPPORT
#define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy)
#define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy)
#elif CPU_SSE_SUPPORT
static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy )
{
__m128 vh, vl;
vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) );
vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) );
return _mm_add_ps( vh, vl );
}
#define CPU_HADD_PD(vx,vy) _mm_add_sd(vx,_mm_unpackhi_pd(vy,vy))
#endif
#if CPU_SSE4_1_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_cvtepu8_epi32(x)
#define CPU_CVT_S8_TO_I32(x,vzero) _mm_cvtepi8_epi32(x)
#elif CPU_SSE2_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((x),(vzero)),(vzero))
static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero )
{
__m128i vsign;
vsign = _mm_cmpgt_epi8( vzero, vx );
return _mm_unpacklo_epi16( _mm_unpacklo_epi8( vx, vsign ), _mm_unpacklo_epi8( vsign, vsign ) );
}
#endif
#if CPU_SSE4_1_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask)
#define CPU_BLENDV_PD(x,y,mask) _mm_blendv_pd(x,y,mask)
#elif CPU_SSE2_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask))
#define CPU_BLENDV_PD(x,y,mask) _mm_or_pd(_mm_andnot_pd(mask,x),_mm_and_pd(y,mask))
#endif
/*
CPU_FMADD = ((f0*f1)+t0)
CPU_FMSUB = ((f0*f1)-t0)
*/
#if CPU_FMA3_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0)
#elif CPU_FMA4_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0)
#else
#define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0)
#endif
////
#if CPU_SSE_SUPPORT
extern const uint32_t simd4fSignMask[4];
extern const uint32_t simd4fSignMaskInv[4];
extern const float simd4fHalf[4];
extern const float simd4fOne[4];
extern const float simd4fTwo[4];
extern const float simd4fThree[4];
extern const uint32_t simd4uOne[4];
extern const uint32_t simd4uOneInv[4];
extern const uint32_t simd4uTwo[4];
extern const uint32_t simd4uFour[4];
extern const float simd4fQuarter[4];
extern const float simd4fPi[4];
extern const float simd4fZeroOneTwoThree[4];
extern const uint32_t simd4fAlphaMask[4];
extern const float simd4f255[4];
extern const float simd4f255Inv[4];
#endif
#if CPU_SSE2_SUPPORT
/* Input range between -8192 and 8192 */
__m128 simd4f_sin_ps( __m128 x );
__m128 simd4f_cos_ps( __m128 x );
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c );
#endif
#if CPU_SSE2_SUPPORT
__m128 simd4f_exp2_ps( __m128 x );
__m128 simd4f_log2_ps( __m128 x );
__m128 simd4f_pow_ps( __m128 x, __m128 y );
#endif
#if CPU_SSE2_SUPPORT
__m128 simd4f_pow12d5_ps( __m128 arg );
__m128 simd4f_pow5d12_ps( __m128 arg );
#endif
////
#if CPU_SSE2_SUPPORT
#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif
static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx )
{
__m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
vx2 = _mm_mul_ps( vx, vx );
vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) );
vpwsqrtinv = _mm_rsqrt_ps( vpow );
vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) );
}
static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
{
__m128 vpow;
vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) );
vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) );
#if 0
vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
#else
vx = _mm_sqrt_ps( vx );
vx = _mm_sqrt_ps( vx );
#endif
return vx;
}
#endif
////
#if CPU_SSE_SUPPORT
static inline void simdPrintDebugSSE4f( char *str, __m128 v )
{
float CPU_ALIGN16 store[4];
_mm_store_ps( (float *)store, v );
printf( "%s %f %f %f %f\n", str, (double)store[0], (double)store[1], (double)store[2], (double)store[3] );
return;
}
static inline void simdPrintDebugSSE2d( char *str, __m128d v )
{
double CPU_ALIGN16 store[2];
_mm_store_pd( (double *)store, v );
printf( "%s %f %f\n", str, store[0], store[1] );
return;
}
static inline void simdPrintDebugSSE16u8( char *str, __m128i v )
{
uint8_t CPU_ALIGN16 store[16];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] );
return;
}
static inline void simdPrintDebugSSE8u16( char *str, __m128i v )
{
uint16_t CPU_ALIGN16 store[8];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] );
return;
}
static inline void simdPrintDebugSSE4u32( char *str, __m128i v )
{
uint32_t CPU_ALIGN16 store[4];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] );
return;
}
static inline void simdPrintDebugSSE2u64( char *str, __m128i v )
{
uint64_t CPU_ALIGN16 store[2];
_mm_store_si128( (void *)store, v );
printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] );
return;
}
#endif
////
#endif