Use simde on graphics_utils

This commit is contained in:
Benau 2022-04-29 11:28:47 +08:00
parent 383bd93261
commit 98eb912a76
4 changed files with 16 additions and 42 deletions

View File

@ -153,7 +153,8 @@ LOCAL_MODULE := graphics_utils
LOCAL_PATH := .
LOCAL_CPP_FEATURES += rtti
LOCAL_SRC_FILES := $(wildcard ../lib/graphics_utils/mipmap/*.c)
LOCAL_CFLAGS := -I../lib/graphics_utils/mipmap
LOCAL_CFLAGS := -I../lib/graphics_utils/mipmap \
-I../lib/simd_wrapper
ifeq ($(TARGET_ARCH_ABI), armeabi-v7a)
LOCAL_ARM_NEON := false
endif

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 2.6)
include_directories("${PROJECT_SOURCE_DIR}/lib/simd_wrapper")
if (UNIX OR MINGW)
add_definitions(-O3)
endif()

View File

@ -32,36 +32,8 @@
////
#include <simd_wrapper.h>
#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
#if __SSE__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 1 ) ) || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 2 ) ) || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#if __SSE4A__ || CPU_ENABLE_SSE4A
#include <ammintrin.h>
#define CPU_SSE4A_SUPPORT (1)
@ -355,7 +327,7 @@ static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
/*
 * Debug helper: prints the label `str` followed by the four float lanes of
 * `v` on one line of stdout.
 *
 * str  label printed before the lane values (passed to printf as %s)
 * v    128-bit vector whose four floats are dumped
 */
static inline void simdPrintDebugSSE4f( char *str, __m128 v )
{
/* Scratch buffer for the lanes; CPU_ALIGN16 is defined outside this view,
 * presumably a 16-byte alignment attribute as _mm_store_ps needs an aligned
 * destination -- TODO confirm. */
float CPU_ALIGN16 store[4];
/* NOTE(review): this listing looks like a diff hunk with its +/- markers
 * stripped; the (void *) store appears to be the pre-change line and the
 * (float *) store its replacement -- confirm against the real file. */
_mm_store_ps( (void *)store, v );
_mm_store_ps( (float *)store, v );
/* Each lane is cast to double explicitly to match the %f conversions. */
printf( "%s %f %f %f %f\n", str, (double)store[0], (double)store[1], (double)store[2], (double)store[3] );
return;
}
@ -363,7 +335,7 @@ static inline void simdPrintDebugSSE4f( char *str, __m128 v )
/*
 * Debug helper: prints the label `str` followed by the two double lanes of
 * `v` on one line of stdout.
 *
 * str  label printed before the lane values (passed to printf as %s)
 * v    128-bit vector whose two doubles are dumped
 */
static inline void simdPrintDebugSSE2d( char *str, __m128d v )
{
/* Scratch buffer for the lanes; CPU_ALIGN16 is defined outside this view,
 * presumably a 16-byte alignment attribute as _mm_store_pd needs an aligned
 * destination -- TODO confirm. */
double CPU_ALIGN16 store[2];
/* NOTE(review): this listing looks like a diff hunk with its +/- markers
 * stripped; the (void *) store appears to be the pre-change line and the
 * (double *) store its replacement -- confirm against the real file. */
_mm_store_pd( (void *)store, v );
_mm_store_pd( (double *)store, v );
printf( "%s %f %f\n", str, store[0], store[1] );
return;
}

View File

@ -776,7 +776,7 @@ static void imStaticKernel4Linear( unsigned char *dst, int pointx, int pointy, i
}
#if CPU_SSE2_SUPPORT
_mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
#else
dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
@ -825,7 +825,7 @@ static void imStaticKernel4Linear_Core( unsigned char *dst, int pointx, int poin
mapy = 0;
}
_mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
return;
}
@ -1091,7 +1091,7 @@ static void imStaticKernel2sRGB( unsigned char *dst, int pointx, int pointy, imS
uint32_t i;
} u;
vsum = linear2srgb3( vsum );
_mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
dst[0] = u.c[0];
dst[1] = u.c[1];
#else
@ -1157,7 +1157,7 @@ static void imStaticKernel3sRGB( unsigned char *dst, int pointx, int pointy, imS
uint32_t i;
} u;
vsum = linear2srgb3( vsum );
_mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
dst[0] = u.c[0];
dst[1] = u.c[1];
dst[2] = u.c[2];
@ -1222,7 +1222,7 @@ static void imStaticKernel4sRGB( unsigned char *dst, int pointx, int pointy, imS
#if CPU_SSE2_SUPPORT
vsum = linear2srgb3( vsum );
_mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
#else
dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
@ -1291,7 +1291,7 @@ static void imStaticKernel3sRGB_Core( unsigned char *dst, int pointx, int pointy
#endif
vsum0 = linear2srgb3( vsum0 );
_mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum0 ), vzero ), vzero ) ) );
_mm_store_ss( (float *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum0 ), vzero ), vzero ) ) );
dst[0] = u.c[0];
dst[1] = u.c[1];
dst[2] = u.c[2];
@ -1332,7 +1332,7 @@ static void imStaticKernel4sRGB_Core( unsigned char *dst, int pointx, int pointy
}
vsum = linear2srgb3( vsum );
_mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
return;
}
@ -2729,7 +2729,7 @@ static void imDynamicKernel2sRGB( unsigned char *dst, imGenericMatrixState *stat
uint32_t i;
} u;
vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
_mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
dst[0] = u.c[0];
dst[1] = u.c[1];
#else
@ -2800,7 +2800,7 @@ static void imDynamicKernel3sRGB( unsigned char *dst, imGenericMatrixState *stat
uint32_t i;
} u;
vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
_mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
dst[0] = u.c[0];
dst[1] = u.c[1];
dst[2] = u.c[2];
@ -2871,7 +2871,7 @@ static void imDynamicKernel4sRGB( unsigned char *dst, imGenericMatrixState *stat
#if CPU_SSE2_SUPPORT
vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
_mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
_mm_store_ss( (float *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
#else
matrixsum = 1.0f / matrixsum;
sum0 *= matrixsum;