From 76aa38e5b4fbae3041c393e5b76727ea16a18aa7 Mon Sep 17 00:00:00 2001 From: Benau Date: Mon, 13 Mar 2017 10:28:43 +0800 Subject: [PATCH] Try HQMipmapGenerator --- CMakeLists.txt | 5 + lib/graphics_utils/CMakeLists.txt | 9 + lib/graphics_utils/mipmap/cpusimd.c | 568 ++++ lib/graphics_utils/mipmap/cpusimd.h | 410 +++ lib/graphics_utils/mipmap/img.c | 628 ++++ lib/graphics_utils/mipmap/img.h | 74 + lib/graphics_utils/mipmap/imgresize.c | 4098 +++++++++++++++++++++++++ lib/graphics_utils/mipmap/imgresize.h | 150 + src/config/user_config.hpp | 5 + src/graphics/central_settings.cpp | 2 +- src/graphics/hq_mipmap_generator.cpp | 119 + src/graphics/hq_mipmap_generator.hpp | 98 + src/graphics/stk_tex_manager.cpp | 6 +- src/graphics/stk_texture.cpp | 58 +- src/graphics/stk_texture.hpp | 2 + src/graphics/threaded_tex_loader.cpp | 3 +- 16 files changed, 6226 insertions(+), 9 deletions(-) create mode 100644 lib/graphics_utils/CMakeLists.txt create mode 100644 lib/graphics_utils/mipmap/cpusimd.c create mode 100644 lib/graphics_utils/mipmap/cpusimd.h create mode 100644 lib/graphics_utils/mipmap/img.c create mode 100644 lib/graphics_utils/mipmap/img.h create mode 100644 lib/graphics_utils/mipmap/imgresize.c create mode 100644 lib/graphics_utils/mipmap/imgresize.h create mode 100644 src/graphics/hq_mipmap_generator.cpp create mode 100644 src/graphics/hq_mipmap_generator.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 31ef6f003..8a0a558a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,10 @@ if((WIN32 AND NOT MINGW) OR APPLE) set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/") set(JPEG_LIBRARY jpeglib) endif() + +add_subdirectory("${PROJECT_SOURCE_DIR}/lib/graphics_utils") +include_directories("${PROJECT_SOURCE_DIR}/lib/graphics_utils") + # Build the irrlicht library add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht") include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include") @@ -370,6 +374,7 @@ target_link_libraries(supertuxkart bulletmath enet stkirrlicht + graphics_utils ${Angelscript_LIBRARIES} ${CURL_LIBRARIES} ${OGGVORBIS_LIBRARIES} diff --git a/lib/graphics_utils/CMakeLists.txt b/lib/graphics_utils/CMakeLists.txt new file mode 100644 index 000000000..fa8efa49e --- /dev/null +++ b/lib/graphics_utils/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required(VERSION 2.6) +if (UNIX OR MINGW) + add_definitions(-O3 -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -ffast-math) +endif() +add_library(graphics_utils STATIC + mipmap/cpusimd.c + mipmap/img.c + mipmap/imgresize.c +) diff --git a/lib/graphics_utils/mipmap/cpusimd.c b/lib/graphics_utils/mipmap/cpusimd.c new file mode 100644 index 000000000..74e0b8be6 --- /dev/null +++ b/lib/graphics_utils/mipmap/cpusimd.c @@ -0,0 +1,568 @@ +/* ----------------------------------------------------------------------------- + * + * Copyright (c) 2008-2016 Alexis Naveros. + * + * + * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier + * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps() + * + * + * Some functions are Copyright (C) 2008 José Fonseca + * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps() + * + * + * Portions developed under contract to the SURVICE Engineering Company. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * ----------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#include "cpusimd.h" + + +//// + + +#if CPU_SSE_SUPPORT + +const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 }; +const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 }; +const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 }; +const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 }; +const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 }; +const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1, ~1, ~1, ~1 }; +const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 }; +const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 }; +const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 }; +const float simd4fPi[4] CPU_ALIGN16 = { M_PI, M_PI, M_PI, M_PI }; +const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 }; +const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }; +const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f }; +const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f }; + +#endif + + +//// + + +#if CPU_SSE2_SUPPORT + + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; +static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; +static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; +static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; +static const float simd4f_sincof_p0[4] CPU_ALIGN16 = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; +static const float simd4f_sincof_p1[4] CPU_ALIGN16 = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; +static const float simd4f_sincof_p2[4] CPU_ALIGN16 = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; +static const float simd4f_coscof_p0[4] CPU_ALIGN16 = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; +static const float simd4f_coscof_p1[4] CPU_ALIGN16 = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; +static const float simd4f_coscof_p2[4] CPU_ALIGN16 = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; + +__m128 simd4f_sin_ps( __m128 x ) +{ + __m128 xmm1, xmm2, xmm3, sign_bit, y; + __m128i emm0, emm2; + + xmm2 = _mm_setzero_ps(); + + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv ); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI); + + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne); + emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv); + y = _mm_cvtepi32_ps(emm2); + + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 + #define CPU_MMX_SUPPORT (1) +#endif +#if __SSE__ || _M_X64 || _M_IX86_FP >= 1 || CPU_ENABLE_SSE + #include + #define CPU_SSE_SUPPORT (1) +#endif +#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2 || CPU_ENABLE_SSE2 + #include + #define CPU_SSE2_SUPPORT (1) +#endif +#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3 + #include + #define CPU_SSE3_SUPPORT (1) +#endif +#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3 + #include + #define CPU_SSSE3_SUPPORT (1) +#endif +#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1 + #include + #define CPU_SSE4_1_SUPPORT (1) +#endif +#if __SSE4_2__ || CPU_ENABLE_SSE4_2 + #include + #define CPU_SSE4_2_SUPPORT (1) +#endif +#if __SSE4A__ || CPU_ENABLE_SSE4A + #include + #define CPU_SSE4A_SUPPORT (1) +#endif +#if __AVX__ || CPU_ENABLE_AVX + #include + #define CPU_AVX_SUPPORT (1) +#endif +#if __AVX2__ || CPU_ENABLE_AVX2 + #include + #define CPU_AVX2_SUPPORT (1) +#endif +#if __XOP__ || CPU_ENABLE_XOP + #include + #define CPU_XOP_SUPPORT (1) +#endif +#if __FMA3__ || CPU_ENABLE_FMA3 + #include + #define CPU_FMA3_SUPPORT (1) +#endif +#if __FMA4__ || CPU_ENABLE_FMA4 + #include + #define CPU_FMA4_SUPPORT (1) +#endif +#if __RDRND__ || CPU_ENABLE_RDRND + #include + #define CPU_RDRND_SUPPORT (1) +#endif +#if __POPCNT__ || CPU_ENABLE_POPCNT + #include + #define CPU_POPCNT_SUPPORT (1) +#endif +#if __LZCNT__ || CPU_ENABLE_LZCNT + #include + #define CPU_LZCNT_SUPPORT (1) +#endif +#if __F16C__ || CPU_ENABLE_F16C + #include + #define CPU_F16C_SUPPORT (1) +#endif +#if __BMI__ || CPU_ENABLE_BMI + #include + #define CPU_BMI_SUPPORT (1) +#endif +#if __BMI2__ || CPU_ENABLE_BMI2 + #include + #define CPU_BMI2_SUPPORT (1) +#endif +#if __TBM__ || CPU_ENABLE_TBM + #include + #define CPU_TBM_SUPPORT (1) +#endif + + + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + #define CPU_ALIGN16 __attribute__((aligned(16))) + #define CPU_ALIGN32 __attribute__((aligned(32))) + #define CPU_ALIGN64 __attribute__((aligned(64))) +#elif defined(_MSC_VER) + #define CPU_ALIGN16 __declspec(align(16)) + #define CPU_ALIGN64 __declspec(align(64)) +#else + #define CPU_ALIGN16 + #define CPU_ALIGN32 + #define CPU_ALIGN64 + #warning "SSE/AVX Disabled: Unsupported Compiler." + #undef CPU_SSE_SUPPORT + #undef CPU_SSE2_SUPPORT + #undef CPU_SSE3_SUPPORT + #undef CPU_SSSE3_SUPPORT + #undef CPU_SSE4_1_SUPPORT + #undef CPU_SSE4_2_SUPPORT + #undef CPU_AVX_SUPPORT + #undef CPU_AVX2_SUPPORT + #undef CPU_XOP_SUPPORT + #undef CPU_FMA3_SUPPORT + #undef CPU_FMA4_SUPPORT +#endif + + +//// + + +#if CPU_SSE_SUPPORT + #define CPU_APPROX_DIV_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rcp_ss(_mm_set_ss(w)))) + #define CPU_APPROX_SQRT_FLOAT(z) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(z)))) + #define CPU_APPROX_RSQRT_FLOAT(z) _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(z))) + #define CPU_APPROX_DIVSQRT_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(w)))) +#else + #define CPU_APPROX_DIV_FLOAT(z,w) ((z)/(w)) + #define CPU_APPROX_SQRT_FLOAT(z) (sqrtf(z)) + #define CPU_APPROX_RSQRT_FLOAT(z) (1.0/sqrtf(z)) + #define CPU_APPROX_DIVSQRT_FLOAT(z,w) ((z)/sqrtf(w)) +#endif + + +#if CPU_SSE3_SUPPORT + #define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy) + #define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy) +#elif CPU_SSE_SUPPORT + static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy ) + { + __m128 vh, vl; + vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) ); + vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) ); + return _mm_add_ps( vh, vl ); + } + #define CPU_HADD_PD(vx,vy) _mm_add_sd(vx,_mm_unpackhi_pd(vy,vy)) +#endif + + +#if CPU_SSE4_1_SUPPORT + #define CPU_CVT_U8_TO_I32(x,vzero) _mm_cvtepu8_epi32(x) + #define CPU_CVT_S8_TO_I32(x,vzero) _mm_cvtepi8_epi32(x) +#elif CPU_SSE2_SUPPORT + #define CPU_CVT_U8_TO_I32(x,vzero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((x),(vzero)),(vzero)) +static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero ) +{ + __m128i vsign; + vsign = _mm_cmpgt_epi8( vzero, vx ); + return _mm_unpacklo_epi16( _mm_unpacklo_epi8( vx, vsign ), _mm_unpacklo_epi8( vsign, vsign ) ); +} +#endif + + +#if CPU_SSE4_1_SUPPORT + #define CPU_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask) + #define CPU_BLENDV_PD(x,y,mask) _mm_blendv_pd(x,y,mask) +#elif CPU_SSE2_SUPPORT + #define CPU_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask)) + #define CPU_BLENDV_PD(x,y,mask) _mm_or_pd(_mm_andnot_pd(mask,x),_mm_and_pd(y,mask)) +#endif + + + +/* +CPU_FMADD = ((f0*f1)+t0) +CPU_FMSUB = ((f0*f1)-t0) +*/ +#if CPU_FMA3_SUPPORT + #define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0) + #define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0) + #define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0) + #define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0) + #define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0) + #define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0) + #define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0) + #define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0) + #define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0) + #define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0) + #define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0) + #define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0) + #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0) + #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0) + #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0) + #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0) +#elif CPU_FMA4_SUPPORT + #define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0) + #define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0) + #define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0) + #define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0) + #define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0) + #define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0) + #define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0) + #define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0) + #define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0) + #define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0) + #define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0) + #define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0) + #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0) + #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0) + #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0) + #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0) +#else + #define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0) + #define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0) + #define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0) + #define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0) + #define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0) + #define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0) + #define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0) + #define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0) + #define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0) + #define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0) + #define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0) + #define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0) + #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0) + #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0) + #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0) + #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0) +#endif + + +//// + + +#if CPU_SSE_SUPPORT + +extern const uint32_t simd4fSignMask[4]; +extern const uint32_t simd4fSignMaskInv[4]; +extern const float simd4fHalf[4]; +extern const float simd4fOne[4]; +extern const float simd4fTwo[4]; +extern const float simd4fThree[4]; +extern const uint32_t simd4uOne[4]; +extern const uint32_t simd4uOneInv[4]; +extern const uint32_t simd4uTwo[4]; +extern const uint32_t simd4uFour[4]; +extern const float simd4fQuarter[4]; +extern const float simd4fPi[4]; +extern const float simd4fZeroOneTwoThree[4]; +extern const uint32_t simd4fAlphaMask[4]; +extern const float simd4f255[4]; +extern const float simd4f255Inv[4]; + +#endif + + +#if CPU_SSE2_SUPPORT + +/* Input range between -8192 and 8192 */ +__m128 simd4f_sin_ps( __m128 x ); +__m128 simd4f_cos_ps( __m128 x ); +void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c ); + +#endif + +#if CPU_SSE2_SUPPORT + +__m128 simd4f_exp2_ps( __m128 x ); +__m128 simd4f_log2_ps( __m128 x ); +__m128 simd4f_pow_ps( __m128 x, __m128 y ); + +#endif + +#if CPU_SSE2_SUPPORT + +__m128 simd4f_pow12d5_ps( __m128 arg ); +__m128 simd4f_pow5d12_ps( __m128 arg ); + +#endif + + +//// + + +#if CPU_SSE2_SUPPORT + +#ifndef CC_ALWAYSINLINE + #if defined(__GNUC__) || defined(__INTEL_COMPILER) + #define CC_ALWAYSINLINE __attribute__((always_inline)) + #else + #define CC_ALWAYSINLINE + #endif +#endif + +static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx ) +{ + __m128 vpow, vpwsqrtinv, vpwsqrt, vx2; + vx2 = _mm_mul_ps( vx, vx ); + vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) ); + vpwsqrtinv = _mm_rsqrt_ps( vpow ); + vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv ); + return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) ); +} + +static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx ) +{ + __m128 vpow; + vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) ); + vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) ); +#if 0 + vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) ); + vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) ); +#else + vx = _mm_sqrt_ps( vx ); + vx = _mm_sqrt_ps( vx ); +#endif + return vx; +} + +#endif + + +//// + + +#if CPU_SSE_SUPPORT + +static inline void simdPrintDebugSSE4f( char *str, __m128 v ) +{ + float CPU_ALIGN16 store[4]; + _mm_store_ps( (void *)store, v ); + printf( "%s %f %f %f %f\n", str, store[0], store[1], store[2], store[3] ); + return; +} + +static inline void simdPrintDebugSSE2d( char *str, __m128d v ) +{ + double CPU_ALIGN16 store[2]; + _mm_store_pd( (void *)store, v ); + printf( "%s %f %f\n", str, store[0], store[1] ); + return; +} + +static inline void simdPrintDebugSSE16u8( char *str, __m128i v ) +{ + uint8_t CPU_ALIGN16 store[16]; + _mm_store_si128( (void *)store, v ); + printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] ); + return; +} + +static inline void simdPrintDebugSSE8u16( char *str, __m128i v ) +{ + uint16_t CPU_ALIGN16 store[8]; + _mm_store_si128( (void *)store, v ); + printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] ); + return; +} + +static inline void simdPrintDebugSSE4u32( char *str, __m128i v ) +{ + uint32_t CPU_ALIGN16 store[4]; + _mm_store_si128( (void *)store, v ); + printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] ); + return; +} + +static inline void simdPrintDebugSSE2u64( char *str, __m128i v ) +{ + uint64_t CPU_ALIGN16 store[2]; + _mm_store_si128( (void *)store, v ); + printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] ); + return; +} + +#endif + + +//// + + +#endif + diff --git a/lib/graphics_utils/mipmap/img.c b/lib/graphics_utils/mipmap/img.c new file mode 100644 index 000000000..369e21f16 --- /dev/null +++ b/lib/graphics_utils/mipmap/img.c @@ -0,0 +1,628 @@ +/* ***************************************************************************** + * + * Copyright (c) 2007-2016 Alexis Naveros. + * Portions developed under contract to the SURVICE Engineering Company. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this file; see the file named COPYING for more + * information. + * + * ***************************************************************************** + */ + +#include +#include +#include +#include +#include +#include + + +#include "cpusimd.h" + +#include "img.h" + + +#ifndef ADDRESS + #define ADDRESS(p,o) ((void *)(((char *)p)+(o))) +#endif + + +//// + + +void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey ) +{ + int y; + void *dst, *src; + src = ADDRESS( image->data, ( srcx * image->format.bytesperpixel ) + ( srcy * image->format.bytesperline ) ); + dst = ADDRESS( image->data, ( dstx * image->format.bytesperpixel ) + ( dsty * image->format.bytesperline ) ); + for( y = 0 ; y < sizey ; y++ ) + { + memcpy( dst, src, sizex * image->format.bytesperpixel ); + src = ADDRESS( src, image->format.bytesperline ); + dst = ADDRESS( dst, image->format.bytesperline ); + } + return; +} + + +#if CPU_SSE2_SUPPORT +static const uint16_t CPU_ALIGN16 imgBlendRgbMask[8] = { 0xffff, 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000 }; +static const uint8_t CPU_ALIGN16 imgBlendAlphaTestMask[16] = { 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff }; +static const uint16_t CPU_ALIGN16 imgBlendRoundBias[8] = { 128, 128, 128, 128, 128, 128, 128, 128 }; + #if CPU_SSSE3_SUPPORT +static const uint8_t CPU_ALIGN16 imgBlendShufMask[16] = { 6,7,6,7,6,7,6,7, 14,15,14,15,14,15,14,15 }; + #endif +#endif + +static void imgBlendImageRgba2Rgba( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ) +{ + int x, y; +#if CPU_SSE2_SUPPORT + int row4size; + __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23; + __m128i vzero, v255, vrgbmask, valphatest, vroundbias; + #if CPU_SSSE3_SUPPORT + __m128i vshufmask; + #endif +#else + int32_t dstr, dstg, dstb, dsta; + int32_t srcr, srcg, srcb, srca; +#endif + unsigned char *src, *srcrow, *dstrow; + uint32_t *dst; + + /* TODO: Other function to clamp copy area? */ + +#if CPU_SSE2_SUPPORT + row4size = srcimage->format.width & ~3; + vzero = _mm_setzero_si128(); + v255 = _mm_set1_epi16( 255 ); + vrgbmask = _mm_load_si128( (void *)imgBlendRgbMask ); + valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask ); + vroundbias = _mm_load_si128( (void *)imgBlendRoundBias ); + #if CPU_SSSE3_SUPPORT + vshufmask = _mm_load_si128( (void *)imgBlendShufMask ); + #endif +#endif + + src = srcimage->data; + dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) ); + for( y = 0 ; y < srcimage->format.height ; y++ ) + { + srcrow = src; + dstrow = (unsigned char *)dst; + +#if CPU_SSE2_SUPPORT + for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 ) + { + /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */ + vsrc23 = _mm_loadu_si128( (void *)srcrow ); + if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf ) + continue; + vdst23 = _mm_loadu_si128( (void *)dstrow ); + /* r0__g0__b0__a0__, r1__g1__b1__a1__ */ + vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero ); + vdst01 = _mm_unpacklo_epi8( vdst23, vzero ); + /* r2__g2__b2__a2__, r3__g3__b3__a3__ */ + vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero ); + vdst23 = _mm_unpackhi_epi8( vdst23, vzero ); + #if CPU_SSSE3_SUPPORT + /* __a0__a0__a0__a0, __a1__a1__a1__a1 */ + vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask ); + /* __a2__a2__a2__a2, __a3__a3__a3__a3 */ + vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask ); + #else + vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff ); + vblend01 = _mm_shufflehi_epi16( vblend01, 0xff ); + vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff ); + vblend23 = _mm_shufflehi_epi16( vblend23, 0xff ); + #endif + vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias ); + vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, _mm_and_si128( vblend23, vrgbmask ) ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias ); + /* Correction to divide by 255 instead of 256 */ + vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 ); + vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 ); + /* Combine interleaved and store */ + _mm_storeu_si128( (void *)dstrow, _mm_packus_epi16( vdst01, vdst23 ) ); + } + for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 ) + { + if( !( srcrow[3] ) ) + continue; + vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) ); + vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) ); + vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero ); + vdst01 = _mm_unpacklo_epi8( vdst01, vzero ); + #if CPU_SSSE3_SUPPORT + vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask ); + #else + vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff ); + vblend01 = _mm_shufflehi_epi16( vblend01, 0xff ); + #endif + vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias ); + /* Correction to divide by 255 instead of 256 */ + vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 ); + _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_packus_epi16( vdst01, vdst01 ) ) ); + } +#else + for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 ) + { + if( !( srcrow[3] ) ) + continue; + srcr = (int32_t)srcrow[0]; + srcg = (int32_t)srcrow[1]; + srcb = (int32_t)srcrow[2]; + srca = (int32_t)srcrow[3]; + dstr = (int32_t)dstrow[0]; + dstg = (int32_t)dstrow[1]; + dstb = (int32_t)dstrow[2]; + dsta = (int32_t)dstrow[3]; + dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 ); + dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 ); + dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 ); + dsta = ( ( dsta << 8 ) - dsta + ( srca * srca ) + 128 ); + dstr = ( dstr + ( dstr >> 8 ) ) >> 8; + dstg = ( dstg + ( dstg >> 8 ) ) >> 8; + dstb = ( dstb + ( dstb >> 8 ) ) >> 8; + dsta = ( dsta + ( dsta >> 8 ) ) >> 8; + if( dsta > 255 ) + dsta = 255; + dstrow[0] = (unsigned char)dstr; + dstrow[1] = (unsigned char)dstg; + dstrow[2] = (unsigned char)dstb; + dstrow[3] = (unsigned char)dsta; + } +#endif + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + +static void imgBlendImageRgba2Rgbx( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ) +{ + int x, y; +#if CPU_SSE2_SUPPORT + int row4size; + __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23; + __m128i vzero, v255, valphatest, vroundbias; + #if CPU_SSSE3_SUPPORT + __m128i vshufmask; + #endif +#else + int32_t dstr, dstg, dstb; + int32_t srcr, srcg, srcb, srca; +#endif + unsigned char *src, *srcrow, *dstrow; + uint32_t *dst; + + /* TODO: Other function to clamp copy area? */ + +#if CPU_SSE2_SUPPORT + row4size = srcimage->format.width & ~3; + vzero = _mm_setzero_si128(); + v255 = _mm_set1_epi16( 255 ); + valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask ); + vroundbias = _mm_load_si128( (void *)imgBlendRoundBias ); + #if CPU_SSSE3_SUPPORT + vshufmask = _mm_load_si128( (void *)imgBlendShufMask ); + #endif +#endif + + src = srcimage->data; + dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) ); + for( y = 0 ; y < srcimage->format.height ; y++ ) + { + srcrow = src; + dstrow = (unsigned char *)dst; + +#if CPU_SSE2_SUPPORT + for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 ) + { + /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */ + vsrc23 = _mm_loadu_si128( (void *)srcrow ); + if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf ) + continue; + vdst23 = _mm_loadu_si128( (void *)dstrow ); + /* r0__g0__b0__a0__, r1__g1__b1__a1__ */ + vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero ); + vdst01 = _mm_unpacklo_epi8( vdst23, vzero ); + /* r2__g2__b2__a2__, r3__g3__b3__a3__ */ + vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero ); + vdst23 = _mm_unpackhi_epi8( vdst23, vzero ); + #if CPU_SSSE3_SUPPORT + /* __a0__a0__a0__a0, __a1__a1__a1__a1 */ + vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask ); + /* __a2__a2__a2__a2, __a3__a3__a3__a3 */ + vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask ); + #else + vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff ); + vblend01 = _mm_shufflehi_epi16( vblend01, 0xff ); + vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff ); + vblend23 = _mm_shufflehi_epi16( vblend23, 0xff ); + #endif + vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias ); + vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, vblend23 ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias ); + /* Correction to divide by 255 instead of 256 */ + vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 ); + vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 ); + /* Combine interleaved and store */ + _mm_storeu_si128( (void *)dstrow, _mm_or_si128( _mm_packus_epi16( vdst01, vdst23 ), valphatest ) ); + } + for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 ) + { + if( !( srcrow[3] ) ) + continue; + vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) ); + vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) ); + vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero ); + vdst01 = _mm_unpacklo_epi8( vdst01, vzero ); + #if CPU_SSSE3_SUPPORT + vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask ); + #else + vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff ); + vblend01 = _mm_shufflehi_epi16( vblend01, 0xff ); + #endif + vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias ); + /* Correction to divide by 255 instead of 256 */ + vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 ); + _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_or_si128( _mm_packus_epi16( vdst01, vdst01 ), valphatest ) ) ); + } +#else + for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 ) + { + if( !( srcrow[3] ) ) + continue; + srcr = (int32_t)srcrow[0]; + srcg = (int32_t)srcrow[1]; + srcb = (int32_t)srcrow[2]; + srca = (int32_t)srcrow[3]; + dstr = (int32_t)dstrow[0]; + dstg = (int32_t)dstrow[1]; + dstb = (int32_t)dstrow[2]; + dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 ); + dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 ); + dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 ); + dstr = ( dstr + ( dstr >> 8 ) ) >> 8; + dstg = ( dstg + ( dstg >> 8 ) ) >> 8; + dstb = ( dstb + ( dstb >> 8 ) ) >> 8; + dstrow[0] = (unsigned char)dstr; + dstrow[1] = (unsigned char)dstg; + dstrow[2] = (unsigned char)dstb; + dstrow[3] = (unsigned char)255; + } +#endif + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + +static void imgBlendImageRgba2Rgb( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ) +{ + int x, y; + int32_t dstr, dstg, dstb; + int32_t srcr, srcg, srcb, srca; + unsigned char *src, *srcrow, *dstrow; + uint32_t *dst; + + /* TODO: Other function to clamp copy area? */ + + src = srcimage->data; + dst = ADDRESS( dstimage->data, ( dstx * 3 ) + ( dsty * dstimage->format.bytesperline ) ); + for( y = 0 ; y < srcimage->format.height ; y++ ) + { + srcrow = src; + dstrow = (unsigned char *)dst; + for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 3 ) + { + if( !( srcrow[3] ) ) + continue; + srcr = (int32_t)srcrow[0]; + srcg = (int32_t)srcrow[1]; + srcb = (int32_t)srcrow[2]; + srca = (int32_t)srcrow[3]; + dstr = (int32_t)dstrow[0]; + dstg = (int32_t)dstrow[1]; + dstb = (int32_t)dstrow[2]; + dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 ); + dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 ); + dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 ); + dstr = ( dstr + ( dstr >> 8 ) ) >> 8; + dstg = ( dstg + ( dstg >> 8 ) ) >> 8; + dstb = ( dstb + ( dstb >> 8 ) ) >> 8; + dstrow[0] = (unsigned char)dstr; + dstrow[1] = (unsigned char)dstg; + dstrow[2] = (unsigned char)dstb; + } + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + + +void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ) +{ + void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ); + blendfunc = 0; + if( srcimage->format.bytesperpixel == 4 ) + { + if( dstimage->format.bytesperpixel == 4 ) + { + if( ( dstimage->format.type == IMG_FORMAT_TYPE_RGBA32 ) || ( dstimage->format.type == IMG_FORMAT_TYPE_BGRA32 ) ) + blendfunc = imgBlendImageRgba2Rgba; + else + blendfunc = imgBlendImageRgba2Rgbx; + } + else if( dstimage->format.bytesperpixel == 3 ) + blendfunc = imgBlendImageRgba2Rgb; + } + return blendfunc; +} + + +int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ) +{ + void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ); + blendfunc = imgBlendGetFunction( dstimage, srcimage ); + if( blendfunc ) + { + blendfunc( dstimage, dstx, dsty, srcimage ); + return 1; + } + return 0; +} + + +//// + + +void imgAllocCopy( imgImage *dstimage, imgImage *srcimage ) +{ + dstimage->format = srcimage->format; + dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline ); + memcpy( dstimage->data, srcimage->data, srcimage->format.height * srcimage->format.bytesperline ); + return; +} + + +void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize ) +{ + int y; + void *dst, *src, *dstrow; + + dstimage->format.width = srcimage->format.width + ( extendsize << 1 ); + dstimage->format.height = srcimage->format.height + ( extendsize << 1 ); + dstimage->format.type = srcimage->format.type; + dstimage->format.bytesperpixel = srcimage->format.bytesperpixel; + dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel; + dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline ); + + src = srcimage->data; + dst = dstimage->data; + for( y = 0 ; y < extendsize ; y++ ) + { + memset( dst, 0, dstimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + for( y = 0 ; y < srcimage->format.height ; y++ ) + { + dstrow = dst; + memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel ); + memcpy( dstrow, src, srcimage->format.width * dstimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel ); + memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel ); + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + for( y = 0 ; y < extendsize ; y++ ) + { + memset( dst, 0, dstimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + + +void imgAllocExtractChannel( imgImage *dstimage, imgImage *srcimage, int channelindex ) +{ + int x, y; + unsigned char *dst, *src, *srcrow; + + dstimage->format.width = srcimage->format.width; + dstimage->format.height = srcimage->format.height; + dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE; + dstimage->format.bytesperpixel = 1; + dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel; + dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline ); + + src = ADDRESS( srcimage->data, channelindex ); + dst = dstimage->data; + for( y = 0 ; y < dstimage->format.height ; y++ ) + { + srcrow = src; + for( x = 0 ; x < dstimage->format.width ; x++ ) + { + dst[x] = *srcrow; + srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel ); + } + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + + +void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize ) +{ + int x, y; + unsigned char *src, *dst, *srcrow, *dstrow; + + dstimage->format.width = srcimage->format.width + ( extendsize << 1 ); + dstimage->format.height = srcimage->format.height + ( extendsize << 1 ); + dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE; + dstimage->format.bytesperpixel = 1; + dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel; + dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline ); + + src = ADDRESS( srcimage->data, channelindex ); + dst = dstimage->data; + for( y = 0 ; y < extendsize ; y++ ) + { + memset( dst, 0, dstimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + for( y = 0 ; y < srcimage->format.height ; y++ ) + { + srcrow = src; + dstrow = dst; + memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel ); + for( x = 0 ; x < srcimage->format.width ; x++ ) + { + dstrow[x] = *srcrow; + srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel ); + } + dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel ); + memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel ); + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + for( y = 0 ; y < extendsize ; y++ ) + { + memset( dst, 0, dstimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + + +void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b ) +{ + int x, y; + unsigned char *dst, *src, *dstrow, *srcrow; + + dstimage->format.width = srcimage->format.width; + dstimage->format.height = srcimage->format.height; + dstimage->format.type = IMG_FORMAT_TYPE_RGBA32; + dstimage->format.bytesperpixel = 4; + dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel; + dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline ); + + src = ADDRESS( srcimage->data, channelindex ); + dst = dstimage->data; + for( y = 0 ; y < dstimage->format.height ; y++ ) + { + srcrow = src; + dstrow = dst; + for( x = 0 ; x < dstimage->format.width ; x++ ) + { + dstrow[0] = r; + dstrow[1] = g; + dstrow[2] = b; + dstrow[3] = *srcrow; + srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel ); + } + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + + return; +} + + +void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast ) +{ + int x, y; + float r, g, b; + unsigned char *dst, *src, *dstrow, *srcrow; + + dstimage->format = srcimage->format; + dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline ); + + brightness += 0.5f; + + if( dstimage->format.bytesperpixel >= 3 ) + { + src = srcimage->data; + dst = dstimage->data; + for( y = 0 ; y < dstimage->format.height ; y++ ) + { + srcrow = src; + dstrow = dst; + for( x = 0 ; x < dstimage->format.width ; x++ ) + { + r = (1.0f/255.0f) * (float)srcrow[0]; + g = (1.0f/255.0f) * (float)srcrow[1]; + b = (1.0f/255.0f) * (float)srcrow[2]; + r = ( ( r - 0.5f ) * contrast ) + brightness; + g = ( ( g - 0.5f ) * contrast ) + brightness; + b = ( ( b - 0.5f ) * contrast ) + brightness; + dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) ); + dstrow[1] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( g * 255.0f ) ) ); + dstrow[2] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( b * 255.0f ) ) ); + if( dstimage->format.bytesperpixel >= 4 ) + dstrow[3] = srcrow[3]; + srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel ); + } + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + } + else if( dstimage->format.bytesperpixel == 1 ) + { + src = srcimage->data; + dst = dstimage->data; + for( y = 0 ; y < dstimage->format.height ; y++ ) + { + srcrow = src; + dstrow = dst; + for( x = 0 ; x < dstimage->format.width ; x++ ) + { + r = (1.0f/255.0f) * (float)srcrow[0]; + r = ( ( r - 0.5f ) * contrast ) + brightness; + dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) ); + srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel ); + dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel ); + } + src = ADDRESS( src, srcimage->format.bytesperline ); + dst = ADDRESS( dst, dstimage->format.bytesperline ); + } + } + + return; +} + + +void imgFree( imgImage *image ) +{ + free( image->data ); + image->data = 0; + return; +} + + +//// + + diff --git a/lib/graphics_utils/mipmap/img.h b/lib/graphics_utils/mipmap/img.h new file mode 100644 index 000000000..3871ca3f3 --- /dev/null +++ b/lib/graphics_utils/mipmap/img.h @@ -0,0 +1,74 @@ +/* ***************************************************************************** + * + * Copyright (c) 2007-2016 Alexis Naveros. + * Portions developed under contract to the SURVICE Engineering Company. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this file; see the file named COPYING for more + * information. + * + * ***************************************************************************** + */ + +#ifndef IMG_H +#define IMG_H + + +typedef struct +{ + int width; + int height; + int type; + int bytesperpixel; + int bytesperline; +} imgFormat; + +enum +{ + IMG_FORMAT_TYPE_ANY, + IMG_FORMAT_TYPE_RGB24, + IMG_FORMAT_TYPE_BGR24, + IMG_FORMAT_TYPE_RGBX32, + IMG_FORMAT_TYPE_BGRX32, + IMG_FORMAT_TYPE_RGBA32, + IMG_FORMAT_TYPE_BGRA32, + IMG_FORMAT_TYPE_GRAYSCALE, + IMG_FORMAT_TYPE_GRAYALPHA +}; + +typedef struct +{ + imgFormat format; + void *data; +} imgImage; + + +//// + + +void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey ); + +void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ); +int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage ); + +void imgAllocCopy( imgImage *dst, imgImage *src ); +void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize ); +void imgAllocExtractChannel( imgImage *dst, imgImage *src, int channelindex ); +void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize ); +void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b ); +void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast ); + +void imgFree( imgImage *image ); + + +#endif + diff --git a/lib/graphics_utils/mipmap/imgresize.c b/lib/graphics_utils/mipmap/imgresize.c new file mode 100644 index 000000000..72edeb9a1 --- /dev/null +++ b/lib/graphics_utils/mipmap/imgresize.c @@ -0,0 +1,4098 @@ +/* ----------------------------------------------------------------------------- + * + * Copyright (c) 2014-2017 Alexis Naveros. + * Portions developed under contract to the SURVICE Engineering Company. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * ----------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cpusimd.h" + +#include "img.h" +#include "imgresize.h" + + +//// + + +#define IM_RESIZE_DEBUG (0) +#define IM_RESIZE_DEBUG_PROGRESS (0) + + +//// + + +#ifndef M_PI + #define M_PI (3.14159265358979323846) +#endif + +#ifndef ADDRESS + #define ADDRESS(p,o) ((void *)(((char *)p)+(o))) +#endif + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + #define CC_ALWAYSINLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) + #define CC_ALWAYSINLINE __forceinline +#else + #define CC_ALWAYSINLINE +#endif + +static inline CC_ALWAYSINLINE uint32_t ccIsPow2Int32( uint32_t v ) +{ + return ( ( v & ( v - 1 ) ) == 0 ); +} + +#define ROUND_POSITIVE_FLOAT(x) ((int)((x)+0.5f)) + + +//// + + +#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__i386__) || defined(__i386) || defined(i386) + +/* Input is 0.0,255.0, output is 0.0,1.0 */ +static inline CC_ALWAYSINLINE float srgb2linear( float v ) +{ + float v2, vpow, vpwsqrt; + union + { + int32_t i; + float f; + } u; + if( v <= (0.04045f*255.0f) ) + v = v * ( (1.0f/12.92f)*(1.0f/255.0f) ); + else + { + v = ( v + (0.055f*255.0f) ) * ( (1.0f/1.055f)*(1.0f/255.0f) ); + v2 = v * v; + u.f = v * 5417434112.0f; + u.i = (int32_t)ROUND_POSITIVE_FLOAT( (float)u.i * 0.8f ); + vpow = u.f; + vpwsqrt = sqrtf( vpow ); + v = ( ( v2 * vpwsqrt ) + ( ( ( v2 * v ) / vpwsqrt ) / sqrtf( vpwsqrt ) ) ) * 0.51011878327f; + } + return v; +} + +/* Input is 0.0,1.0, output is 0.0,255.0 */ +static inline CC_ALWAYSINLINE float linear2srgb( float v ) +{ + float vpow; + union + { + int32_t i; + float f; + } u; + if( v <= 0.0031308f ) + v = v * (12.92f*255.0f); + else + { + u.f = ( v * 6521909350804488192.0f ); + u.i = (int32_t)ROUND_POSITIVE_FLOAT( (float)u.i * 0.666666666666f ); + vpow = u.f; + v = ( v * vpow ) + ( ( v * v ) / sqrtf( vpow ) ); + v = ( (1.055f*255.0f) * sqrtf( sqrtf( v * 0.5290553722f ) ) - (0.055f*255.0f) ); + } + return v; +} + +#else + +/* Input is 0.0,255.0, output is 0.0,1.0 */ +/* Only for reference, this is waayyy too slow and should never be used */ +static inline CC_ALWAYSINLINE float srgb2linear( float v ) +{ + v *= (1.0f/255.0f); + if( v <= 0.04045f ) + v = v * (1.0f/12.92); + else + v = pow( ( v + 0.055f ) * (1.0f/1.055f), 2.4f ); + return v; +} + +/* Input is 0.0,1.0, output is 0.0,255.0 */ +/* Only for reference, this is waayyy too slow and should never be used */ +static inline CC_ALWAYSINLINE float linear2srgb( float v ) +{ + if( v <= 0.0031308f ) + v = v * 12.92f; + else + v = 1.055f * pow( v, 1.0f/2.4f ) - 0.055f; + return 255.0f * v; +} + +#endif + + +//// + + +#if CPU_SSE2_SUPPORT + +static const float CPU_ALIGN16 srgbLinearConst00[4] = { 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f, 1024.0f }; +static const float CPU_ALIGN16 srgbLinearConst01[4] = { (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), 1.0f }; +static const float CPU_ALIGN16 srgbLinearConst02[4] = { 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f }; +static const float CPU_ALIGN16 srgbLinearConst03[4] = { (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f) }; +static const float CPU_ALIGN16 srgbLinearConst04[4] = { 5417434112.0f, 5417434112.0f, 5417434112.0f, 5417434112.0f }; +static const float CPU_ALIGN16 srgbLinearConst05[4] = { 0.8f, 0.8f, 0.8f, 0.8f }; +static const float CPU_ALIGN16 srgbLinearConst06[4] = { 0.51011878327f, 0.51011878327f, 0.51011878327f, 0.51011878327f }; +static const float CPU_ALIGN16 srgbLinearConst07[4] = { 0.0031308f, 0.0031308f, 0.0031308f, 1024.0f }; +static const float CPU_ALIGN16 srgbLinearConst08[4] = { 12.92f*255.0f, 12.92f*255.0f, 12.92f*255.0f, 1.0f }; +static const float CPU_ALIGN16 srgbLinearConst09[4] = { 6521909350804488192.0f, 6521909350804488192.0f, 6521909350804488192.0f, 6521909350804488192.0f }; +static const float CPU_ALIGN16 srgbLinearConst10[4] = { 0.666666666666f, 0.666666666666f, 0.666666666666f, 0.666666666666f }; +static const float CPU_ALIGN16 srgbLinearConst11[4] = { 0.5290553722f, 0.5290553722f, 0.5290553722f, 0.5290553722f }; +static const float CPU_ALIGN16 srgbLinearConst12[4] = { 1.055f*255.0f, 1.055f*255.0f, 1.055f*255.0f, 1.055f*255.0f }; +static const float CPU_ALIGN16 srgbLinearConst13[4] = { -0.055f*255.0f, -0.055f*255.0f, -0.055f*255.0f, -0.055f*255.0f }; +static const float CPU_ALIGN16 srgbLinearConst14[4] = { 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f }; +static const float CPU_ALIGN16 srgbLinearConst15[4] = { (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f) }; + +/* Input is 0.0,255.0 ~ output is 0.0,1.0 ~ alpha channel is passed as-is */ +static inline CC_ALWAYSINLINE __m128 srgb2linear3( __m128 vx ) +{ + __m128 vmask, vbase; + __m128 vpow, vpwsqrtinv, vpwsqrt, vx2; + vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst00 ); + vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst01 ); + vx = _mm_mul_ps( _mm_add_ps( vx, *(__m128*)srgbLinearConst02 ), *(__m128*)srgbLinearConst03 ); + vx2 = _mm_mul_ps( vx, vx ); + vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst04 ) ) ), *(__m128*)srgbLinearConst05 ) ) ); + vpwsqrtinv = _mm_rsqrt_ps( vpow ); + vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv ); + vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), *(__m128*)srgbLinearConst06 ); + return CPU_BLENDV_PS( vx, vbase, vmask ); +} + +/* Input is 0.0,1.0 ~ output is 0.0,255.0 ~ alpha channel is passed as-is */ +static inline CC_ALWAYSINLINE __m128 linear2srgb3( __m128 vx ) +{ + __m128 vmask, vbase, vpow; + vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst07 ); + vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst08 ); + vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst09 ) ) ), *(__m128*)srgbLinearConst10 ) ) ); + vx = _mm_add_ps( _mm_mul_ps( _mm_sqrt_ps( _mm_sqrt_ps( _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), *(__m128*)srgbLinearConst11 ) ) ), *(__m128*)srgbLinearConst12 ), *(__m128*)srgbLinearConst13 ); + return CPU_BLENDV_PS( vx, vbase, vmask ); +} + +/* Input is 0.0,255.0 ~ output is 0.0,1.0 ~ alpha channel is passed as-is */ +static inline CC_ALWAYSINLINE __m128 srgb2linear4( __m128 vx ) +{ + __m128 vmask, vbase; + __m128 vpow, vpwsqrtinv, vpwsqrt, vx2; + vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst14 ); + vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst15 ); + vx = _mm_mul_ps( _mm_add_ps( vx, *(__m128*)srgbLinearConst02 ), *(__m128*)srgbLinearConst03 ); + vx2 = _mm_mul_ps( vx, vx ); + vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst04 ) ) ), *(__m128*)srgbLinearConst05 ) ) ); + vpwsqrtinv = _mm_rsqrt_ps( vpow ); + vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv ); + vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), *(__m128*)srgbLinearConst06 ); + return CPU_BLENDV_PS( vx, vbase, vmask ); +} + +#endif + + +//// + + +static inline CC_ALWAYSINLINE double bessel( double x ) +{ + double sum, t, y; + + /* Zeroth order Bessel function of the first kind. */ + sum = 1.0; + y = x * x * 0.25; + t = y; + sum += t; + t *= y * (1.0/(2.0*2.0)); + sum += t; + t *= y * (1.0/(3.0*3.0)); + sum += t; + t *= y * (1.0/(4.0*4.0)); + sum += t; + t *= y * (1.0/(5.0*5.0)); + sum += t; + t *= y * (1.0/(6.0*6.0)); + sum += t; + t *= y * (1.0/(7.0*7.0)); + sum += t; + t *= y * (1.0/(8.0*8.0)); + sum += t; + t *= y * (1.0/(9.0*9.0)); + sum += t; + t *= y * (1.0/(10.0*10.0)); + sum += t; + t *= y * (1.0/(11.0*11.0)); + sum += t; + t *= y * (1.0/(12.0*12.0)); + sum += t; + t *= y * (1.0/(13.0*13.0)); + sum += t; + t *= y * (1.0/(14.0*14.0)); + sum += t; + + return sum; +} + +static inline CC_ALWAYSINLINE double kaiser( double x, double beta ) +{ + return bessel( beta * sqrt( fmaxf( 0.0, 1.0 - ( x * x ) ) ) ); +} + +static inline CC_ALWAYSINLINE double sinc( double x ) +{ + if( x == 0.0 ) + return 1.0; + x = sin( x * M_PI ) / ( x * M_PI ); + return x; +} + + +//// + + +#if CPU_SSE2_SUPPORT + +static inline CC_ALWAYSINLINE __m128 simd4f_bessel( __m128 x ) +{ + __m128 sum, t, y; + + sum = *(__m128 *)simd4fOne; + y = _mm_mul_ps( *(__m128 *)simd4fQuarter, _mm_mul_ps( x, x ) ); + t = y; + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(2.0f*2.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(3.0f*3.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(4.0f*4.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(5.0f*5.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(6.0f*6.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(7.0f*7.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(8.0f*8.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(9.0f*9.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(10.0f*10.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(11.0f*11.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(12.0f*12.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(13.0f*13.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(14.0f*14.0f) ) ) ); + sum = _mm_add_ps( sum, t ); + + return sum; +} + +static inline CC_ALWAYSINLINE __m128 simd4f_kaiser( __m128 x, __m128 beta ) +{ + return simd4f_bessel( _mm_mul_ps( beta, _mm_sqrt_ps( _mm_max_ps( _mm_setzero_ps(), _mm_sub_ps( *(__m128 *)simd4fOne, _mm_mul_ps( x, x ) ) ) ) ) ); +} + +static inline CC_ALWAYSINLINE __m128 simd4f_sinc( __m128 x ) +{ + __m128 zeromask; + zeromask = _mm_cmpeq_ps( x, _mm_setzero_ps() ); + x = _mm_mul_ps( x, _mm_load_ps( simd4fPi ) ); + x = _mm_div_ps( simd4f_sin_ps( x ), x ); + x = CPU_BLENDV_PS( x, *(__m128 *)simd4fOne, zeromask ); + return x; +} + +#endif + + +//// + + +typedef struct +{ + int matrixsize; + int matrixoffset; + int matrixrowwidth; + int matrixrowsize; + int rowreturn; + float *matrix; + + int minimumalpha; + float dithersum; + float minimumalphaf; + float amplifynormal; + float normalsustainfactor; + void *alloc; + + unsigned char *srcdata; + int width1; + int width2; + int width3; + int width4; + int height; + int bytesperline; +} imStaticMatrixState; + + +static int imBuildStaticMatrix( imStaticMatrixState * restrict state, int sizedivisor, float hopcount, float alpha ) +{ + int i, j, minx, maxx; + double x, xshift, hopsize, offset, scalefactor, hopcountinv, beta, linsq, sum; + double *linear; + float *matrix; + + if( alpha > 16.0f ) + alpha = 16.0f; + beta = alpha * M_PI; + hopcountinv = 1.0 / hopcount; + + scalefactor = 1.0 / (double)sizedivisor; + hopsize = 0.5 * (double)sizedivisor; + offset = hopsize - 0.5; + minx = (int)ceil( ( -hopcount * hopsize ) + offset ); + maxx = (int)floor( ( hopcount * hopsize ) + offset ); + state->matrixoffset = minx; + state->matrixsize = ( maxx - minx ) + 1; + state->matrixrowwidth = ( state->matrixsize + 3 ) & ~3; + state->rowreturn = state->matrixrowwidth - state->matrixsize; + state->matrixrowsize = state->matrixrowwidth * sizeof(float); + +#if IM_RESIZE_DEBUG + printf( "ResizeMatrix ; scalefactor %.3f, offset %.3f, hopsize %.3f\n", scalefactor, offset, hopsize ); +#endif + + linear = malloc( state->matrixrowwidth * sizeof(double) ); + for( i = 0 ; i < state->matrixsize ; i++ ) + { + x = (double)( i + state->matrixoffset ); + xshift = 2.0 * scalefactor * ( x - offset ); + linear[i] = sinc( xshift ) * kaiser( hopcountinv * xshift, beta ); +#if IM_RESIZE_DEBUG + printf( " x[%+.3f] = %+.3f ( %+.3f * %+.3f )\n", x, linear[i], sinc( xshift ), kaiser( hopcountinv * xshift, beta ) ); +#endif + } + for( ; i < state->matrixrowwidth ; i++ ) + linear[i] = 0.0; + + /* Build normalized state */ + state->alloc = malloc( ( state->matrixsize * state->matrixrowsize ) + 16 ); + state->matrix = (void *)( ( (uintptr_t)state->alloc + 0xf ) & ~0xf ); + matrix = state->matrix; + sum = 0.0; + for( i = 0 ; i < state->matrixsize ; i++ ) + { + for( j = 0 ; j < state->matrixsize ; j++ ) + { + linsq = linear[i] * linear[j]; + matrix[j] = (float)linsq; + sum += linsq; + } + for( ; j < state->matrixrowwidth ; j++ ) + matrix[j] = 0.0f; + matrix += state->matrixrowwidth; + } + free( linear ); + +#if IM_RESIZE_DEBUG + printf( "Matrix sum : %f\n", sum ); +#endif + + sum = 1.0 / sum; + j = state->matrixsize * state->matrixrowwidth; + for( i = 0 ; i < j ; i++ ) + state->matrix[i] *= sum; + +#if IM_RESIZE_DEBUG + printf( "Matrix %dx%d :\n", state->matrixsize, state->matrixsize ); + for( i = 0 ; i < state->matrixsize ; i++ ) + { + for( j = 0 ; j < state->matrixsize ; j++ ) + printf( " %+.6f", state->matrix[ ( i * state->matrixrowwidth ) + j ] ); + printf( "\n" ); + } + printf( "Matrix Offset : %d\n", state->matrixoffset ); + printf( "Matrix Size : %d\n", state->matrixsize ); + printf( "Matrix Rowwidth : %d\n", state->matrixrowwidth ); +#endif + + return 1; +} + + +static void imFreeStaticState( imStaticMatrixState * restrict state ) +{ + free( state->alloc ); + state->alloc = 0; + return; +} + + +//// + + +typedef struct +{ + int matrixsizex, matrixsizey; + int matrixoffsetx, matrixoffsety; + float *linearx; + float *lineary; + float beta; + float hopcountinv; + + float dithersum; + int minimumalpha; + float minimumalphaf; + float amplifynormal; + float normalsustainfactor; + void *alloc; + + unsigned char *srcdata; + int width1; + int width2; + int width3; + int width4; + int height; + int bytesperline; +} imGenericMatrixState; + + +static inline int imAllocGenericState( imGenericMatrixState *state, float scalex, float scaley, float hopcount, float alpha ) +{ + int allocx, allocy, size; + void *align; + if( alpha > 16.0f ) + alpha = 16.0f; + allocx = ( (int)ceilf( hopcount / scalex ) + 2 + 3 ) & ~0x3; + allocy = ( (int)ceilf( hopcount / scaley ) + 2 + 3 ) & ~0x3; + size = ( ( allocx + allocy ) * sizeof(float) ) + 16; + state->alloc = malloc( size ); + memset( state->alloc, 0, size ); + align = (void *)( ( (uintptr_t)state->alloc + 0xf ) & ~0xf ); + state->linearx = align; + state->lineary = ADDRESS( align, allocx * sizeof(float) ); + state->beta = (float)alpha * M_PI; + state->hopcountinv = 1.0f / (float)hopcount; + return 1; +} + +static inline void imBuildGenericLinearX( imGenericMatrixState *state, float scalex, float scaleinvx, float sourcex, float hopcount, float alpha, int width ) +{ + int i, minx, maxx; + float hopsizex, offsetx; + float *linearx; + + hopsizex = 0.5 * scaleinvx; + offsetx = (float)sourcex; + minx = (int)ceil( ( -hopcount * hopsizex ) + offsetx ); + maxx = (int)floor( ( hopcount * hopsizex ) + offsetx ); + state->matrixsizex = ( maxx - minx ) + 1; + state->matrixoffsetx = ( minx + ( width << 8 ) ) % width; + + linearx = state->linearx; + scalex *= 2.0; +#if CPU_SSE2_SUPPORT + for( i = 0 ; i < state->matrixsizex ; i += 4 ) + { + __m128 vx, vxshift; + vx = _mm_add_ps( _mm_set1_ps( (float)( i + minx ) ), _mm_load_ps( simd4fZeroOneTwoThree ) ); + vxshift = _mm_mul_ps( _mm_set1_ps( scalex ), _mm_sub_ps( vx, _mm_set1_ps( offsetx ) ) ); + _mm_store_ps( &linearx[i], _mm_mul_ps( simd4f_sinc( vxshift ), simd4f_kaiser( _mm_mul_ps( _mm_set1_ps( state->hopcountinv ), vxshift ), _mm_set1_ps( state->beta ) ) ) ); + #if IM_RESIZE_DEBUG + printf( " linearx[%d] = %.3f\n", i+minx+0, linearx[i+0] ); + printf( " linearx[%d] = %.3f\n", i+minx+1, linearx[i+1] ); + printf( " linearx[%d] = %.3f\n", i+minx+2, linearx[i+2] ); + printf( " linearx[%d] = %.3f\n", i+minx+3, linearx[i+3] ); + #endif + } +#else + for( i = 0 ; i < state->matrixsizex ; i++ ) + { + float x, xshift; + x = (float)( i + minx ); + xshift = scalex * ( x - offsetx ); + linearx[i] = (float)( sinc( xshift ) * kaiser( state->hopcountinv * xshift, state->beta ) ); + #if IM_RESIZE_DEBUG + printf( " linearx[%+.3f] = %.3f ( %+.3f * %+.3f )\n", x, linearx[i], sinc( xshift ), kaiser( state->hopcountinv * xshift, state->beta ) ); + #endif + } +#endif + + return; +} + +static inline void imBuildGenericLinearY( imGenericMatrixState *state, float scaley, float scaleinvy, float sourcey, float hopcount, float alpha, int height ) +{ + int i, miny, maxy; + float hopsizey, offsety; + float *lineary; + + hopsizey = 0.5 * scaleinvy; + offsety = (float)sourcey; + miny = (int)ceil( ( -hopcount * hopsizey ) + offsety ); + maxy = (int)floor( ( hopcount * hopsizey ) + offsety ); + state->matrixsizey = ( maxy - miny ) + 1; + state->matrixoffsety = ( miny + ( height << 8 ) ) % height; + + lineary = state->lineary; + scaley *= 2.0; +#if CPU_SSE2_SUPPORT + for( i = 0 ; i < state->matrixsizey ; i += 4 ) + { + __m128 vy, vyshift; + vy = _mm_add_ps( _mm_set1_ps( (float)( i + miny ) ), _mm_load_ps( simd4fZeroOneTwoThree ) ); + vyshift = _mm_mul_ps( _mm_set1_ps( scaley ), _mm_sub_ps( vy, _mm_set1_ps( offsety ) ) ); + _mm_store_ps( &lineary[i], _mm_mul_ps( simd4f_sinc( vyshift ), simd4f_kaiser( _mm_mul_ps( _mm_set1_ps( state->hopcountinv ), vyshift ), _mm_set1_ps( state->beta ) ) ) ); + #if IM_RESIZE_DEBUG + printf( " lineary[%d] = %.3f\n", i+miny+0, lineary[i+0] ); + printf( " lineary[%d] = %.3f\n", i+miny+1, lineary[i+1] ); + printf( " lineary[%d] = %.3f\n", i+miny+2, lineary[i+2] ); + printf( " lineary[%d] = %.3f\n", i+miny+3, lineary[i+3] ); + #endif + } +#else + for( i = 0 ; i < state->matrixsizey ; i++ ) + { + float y, yshift; + y = (float)( i + miny ); + yshift = scaley * ( y - offsety ); + lineary[i] = (float)( sinc( yshift ) * kaiser( state->hopcountinv * yshift, state->beta ) ); + #if IM_RESIZE_DEBUG + printf( " lineary[%+.3f] = %.3f ( %+.3f * %+.3f )\n", y, lineary[i], sinc( yshift ), kaiser( state->hopcountinv * yshift, state->beta ) ); + #endif + } +#endif + + return; +} + + +static inline void imFreeGenericState( imGenericMatrixState *state ) +{ + free( state->alloc ); + state->alloc = 0; + return; +} + + + +//////////////////////////////////////////////////////////////////////////////// + + + +static void imStaticKernel1Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, sum0; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx; + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + mapx++; + if( mapx >= state->width1 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + + return; +} + +static void imStaticKernel2Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 1; + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + mapx += 2; + if( mapx >= state->width2 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + + return; +} + +static void imStaticKernel3Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx + ( pointx << 1 ); + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + +static void imStaticKernel4Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f, sum0, sum1, sum2, sum3; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + sum3 += f * (float)src[ mapx + 3 ]; +#endif + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); +#else + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); +#endif + + return; +} + + +#if CPU_SSE2_SUPPORT + +static void imStaticKernel4Linear_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + __m128 vsum, vf, v0, v1, v2, v3; + __m128i vzero; + float *matrix; + unsigned char *src; + + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x += 4 ) + { + vf = _mm_load_ps( &matrix[x] ); + v0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 0 ] ) ), vzero ) ); + v1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 4 ] ) ), vzero ) ); + v2 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 8 ] ) ), vzero ) ); + v3 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 12 ] ) ), vzero ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x00 ), v0 ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x55 ), v1 ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xaa ), v2 ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xff ), v3 ) ); + mapx += 16; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + + return; +} + +#endif + + +//// + + +static void imStaticKernel4LinearAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2, sum3; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x] * (float)src[ mapx + 3 ]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + sum3 += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + if( sum3 >= state->minimumalphaf ) + { + f = 1.0f / sum3; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum0 * f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum1 * f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum2 * f ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } + + return; +} + + +#if CPU_SSE2_SUPPORT + +static void imStaticKernel4LinearAlphaNorm_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + uint32_t pixel; + float *matrix; + unsigned char *src; + __m128 vsum0, vsum1, vsum2, vsum3; + __m128 vf, valpha, vr, vg, vb, va, vsrcf; + __m128i vsrc, vshufmask; + __m128i vzero; + + #if CPU_SSSE3_SUPPORT + vshufmask = _mm_setr_epi8( 0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f ); + #endif + vsum0 = _mm_setzero_ps(); + vsum1 = _mm_setzero_ps(); + vsum2 = _mm_setzero_ps(); + vsum3 = _mm_setzero_ps(); + vzero = _mm_castps_si128( _mm_setzero_ps() ); + + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x += 4 ) + { + vf = _mm_load_ps( &matrix[x] ); + /* Load 16 bytes and unpack as RRRR,GGGG,BBBB,AAAA in one SSE register */ + vsrc = _mm_loadu_si128( (void *)&src[ mapx ] ); + #if CPU_SSSE3_SUPPORT + vsrc = _mm_shuffle_epi8( vsrc, vshufmask ); + #else + vshufmask = _mm_shuffle_epi32( vsrc, 0x39 ); + vsrc = _mm_unpacklo_epi16( _mm_unpacklo_epi8( vsrc, vshufmask ), _mm_unpackhi_epi8( vsrc, vshufmask ) ); + #endif + /* Break that into 4 SSE registers as floats: vR,vG,vB,vA */ + vsrcf = _mm_castsi128_ps( vsrc ); + vr = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( vsrcf ), vzero ) ); + #if CPU_SSE3_SUPPORT + vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehdup_ps( vsrcf ) ), vzero ) ); + #else + vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0x55 ) ), vzero ) ); + #endif + vb = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehl_ps( vsrcf, vsrcf ) ), vzero ) ); + va = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0xff ) ), vzero ) ); + valpha = _mm_mul_ps( va, vf ); + vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( vr, valpha ) ); + vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( vg, valpha ) ); + vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( vb, valpha ) ); + vsum3 = _mm_add_ps( vsum3, valpha ); + mapx += 16; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + #if CPU_SSE3_SUPPORT + vsum0 = _mm_hadd_ps( vsum0, vsum1 ); + vsum2 = _mm_hadd_ps( vsum2, vsum3 ); + vsum0 = _mm_hadd_ps( vsum0, vsum2 ); + #else + vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum2 ), _mm_unpackhi_ps( vsum0, vsum2 ) ); + vsum1 = _mm_add_ps( _mm_unpacklo_ps( vsum1, vsum3 ), _mm_unpackhi_ps( vsum1, vsum3 ) ); + vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum1 ), _mm_unpackhi_ps( vsum0, vsum1 ) ); + #endif + + valpha = _mm_shuffle_ps( vsum0, vsum0, 0xff ); + pixel = 0; + if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) ) + { + __m128i vpixel; + vsum0 = _mm_mul_ps( vsum0, _mm_rcp_ps( valpha ) ); + vsum0 = CPU_BLENDV_PS( vsum0, valpha, *(__m128 *)simd4fAlphaMask ); + vpixel = _mm_cvtps_epi32( vsum0 ); + vpixel = _mm_packs_epi32( vpixel, vpixel ); + vpixel = _mm_packus_epi16( vpixel, vpixel ); + pixel = (uint32_t)_mm_cvtsi128_si32( vpixel ); + } + *(uint32_t *)dst = pixel; + + return; +} + +#endif + + +//// + + +static void imStaticKernel1sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f, sum0; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_set_ss( (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set_ss( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); +#endif + mapx++; + if( mapx >= state->width1 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + union + { + char c[4]; + uint32_t i; + } u; + vsum = linear2srgb3( vsum ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; +#else + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); +#endif + + return; +} + +static void imStaticKernel2sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f, sum0, sum1; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 1; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_set_ps( 0.0f, 0.0f, (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); +#endif + mapx += 2; + if( mapx >= state->width2 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + union + { + char c[4]; + uint32_t i; + } u; + vsum = linear2srgb3( vsum ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; + dst[1] = u.c[1]; +#else + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); +#endif + + return; +} + +static void imStaticKernel3sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f, sum0, sum1, sum2; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx + ( pointx << 1 ); + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_set_ps( 0.0f, (float)src[ mapx + 2 ], (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += f * srgb2linear( (float)src[ mapx + 2 ] ); +#endif + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + union + { + char c[4]; + uint32_t i; + } u; + vsum = linear2srgb3( vsum ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; + dst[1] = u.c[1]; + dst[2] = u.c[2]; +#else + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) ); +#endif + + return; +} + +static void imStaticKernel4sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f, sum0, sum1, sum2, sum3; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += f * srgb2linear( (float)src[ mapx + 2 ] ); + sum3 += f * (float)src[ mapx + 3 ]; +#endif + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + vsum = linear2srgb3( vsum ); + _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); +#else + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); +#endif + + return; +} + + +#if CPU_SSE2_SUPPORT + +static void imStaticKernel3sRGB_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + __m128 vsum0, vsum1, vsum2, vsrc0, vsrc1, vsrc2, vf; + __m128i vzero; + float *matrix; + unsigned char *src; + union + { + char c[4]; + uint32_t i; + } u; + + vsum0 = _mm_setzero_ps(); + vsum1 = _mm_setzero_ps(); + vsum2 = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx + ( pointx << 1 ); + for( x = 0 ; x < state->matrixsize ; x += 4 ) + { + vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) ); + vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) ); + vsrc2 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+8 ] ) ), vzero ) ); + vsrc0 = srgb2linear4( vsrc0 ); + vsrc1 = srgb2linear4( vsrc1 ); + vsrc2 = srgb2linear4( vsrc2 ); + vf = _mm_load_ps( &matrix[x] ); + vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x40 ), vsrc0 ) ); + vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xA5 ), vsrc1 ) ); + vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xFE ), vsrc2 ) ); + mapx += 12; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + #if CPU_SSSE3_SUPPORT + vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum1 ), _mm_castps_si128( vsum0 ), 12 ) ) ); + vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum2 ), _mm_castps_si128( vsum1 ), 8 ) ) ); + vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum2 ), _mm_castps_si128( vsum2 ), 4 ) ) ); + #else + vf = _mm_shuffle_ps( vsum0, vsum1, 0x4f ); + vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vf, vf, 0x38 ) ); + vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vsum1, vsum2, 0x0E ) ); + vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vsum2, vsum2, 0x39 ) ); + #endif + + vsum0 = linear2srgb3( vsum0 ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum0 ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; + dst[1] = u.c[1]; + dst[2] = u.c[2]; + + return; +} + +static void imStaticKernel4sRGB_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + __m128 vsum, vsrc0, vsrc1; + __m128i vzero; + float *matrix; + unsigned char *src; + + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x += 2 ) + { + vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) ); + vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) ); + vsrc0 = srgb2linear3( vsrc0 ); + vsrc1 = srgb2linear3( vsrc1 ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x+0] ), vsrc0 ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x+1] ), vsrc1 ) ); + mapx += 8; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + vsum = linear2srgb3( vsum ); + _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + + return; +} + +#endif + + +//// + + +static void imStaticKernel4sRGBAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc, valpha; + __m128i vzero; + uint32_t pixel; +#else + float f, sum0, sum1, sum2, sum3; +#endif + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + valpha = _mm_shuffle_ps( vsrc, _mm_set_ss( 1.0f ), 0x0f ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha, valpha, 0xC0 ), _mm_set1_ps( matrix[x] ) ), vsrc ) ); +#else + f = matrix[x] * (float)src[ mapx + 3 ]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += f * srgb2linear( (float)src[ mapx + 2 ] ); + sum3 += f; +#endif + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + valpha = _mm_shuffle_ps( vsum, vsum, 0xff ); + pixel = 0; + if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) ) + { + __m128i vpixel; + vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha ) ); + vsum = CPU_BLENDV_PS( vsum, valpha, *(__m128 *)simd4fAlphaMask ); + vsum = linear2srgb3( vsum ); + vpixel = _mm_cvtps_epi32( vsum ); + vpixel = _mm_packs_epi32( vpixel, vpixel ); + vpixel = _mm_packus_epi16( vpixel, vpixel ); + pixel = (uint32_t)_mm_cvtsi128_si32( vpixel ); + } + *(uint32_t *)dst = pixel; +#else + if( sum3 >= state->minimumalphaf ) + { + f = 1.0f / sum3; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * f ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } +#endif + + return; +} + + +#if CPU_SSE2_SUPPORT + +static void imStaticKernel4sRGBAlphaNorm_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + __m128 vsum, vsrc0, vsrc1, valpha0, valpha1; + __m128i vzero; + uint32_t pixel; + float *matrix; + unsigned char *src; + + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x += 2 ) + { + vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) ); + vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) ); + valpha0 = _mm_shuffle_ps( vsrc0, _mm_set_ss( 1.0f ), 0x0f ); + valpha1 = _mm_shuffle_ps( vsrc1, _mm_set_ss( 1.0f ), 0x0f ); + vsrc0 = srgb2linear3( vsrc0 ); + vsrc1 = srgb2linear3( vsrc1 ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha0, valpha0, 0xC0 ), _mm_set1_ps( matrix[x+0] ) ), vsrc0 ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha1, valpha1, 0xC0 ), _mm_set1_ps( matrix[x+1] ) ), vsrc1 ) ); + mapx += 8; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + valpha0 = _mm_shuffle_ps( vsum, vsum, 0xff ); + pixel = 0; + if( _mm_comige_ss( valpha0, _mm_load_ss( &state->minimumalphaf ) ) ) + { + __m128i vpixel; + vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha0 ) ); + vsum = CPU_BLENDV_PS( vsum, valpha0, *(__m128 *)simd4fAlphaMask ); + vsum = linear2srgb3( vsum ); + vpixel = _mm_cvtps_epi32( vsum ); + vpixel = _mm_packs_epi32( vpixel, vpixel ); + vpixel = _mm_packus_epi16( vpixel, vpixel ); + pixel = (uint32_t)_mm_cvtsi128_si32( vpixel ); + } + *(uint32_t *)dst = pixel; + + return; +} + +#endif + +//// + + +static void imStaticKernel3Normal( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2, suminv; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx + ( pointx << 1 ); + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + sum0 -= 0.5f*255.0f; + sum1 -= 0.5f*255.0f; + sum2 -= 0.5f*255.0f; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ); + sum0 = (0.5f*255.0f) + ( sum0 * suminv ); + sum1 = (0.5f*255.0f) + ( sum1 * suminv ); + sum2 = (0.5f*255.0f) + ( sum2 * suminv ); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + +static void imStaticKernel4Normal( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float f; +#endif + float sum0, sum1, sum2, sum3, suminv; + float *matrix; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { +#if CPU_SSE2_SUPPORT + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) ); +#else + f = matrix[x]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + sum3 += f * (float)src[ mapx + 3 ]; +#endif + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + vsum = _mm_sub_ps( vsum, _mm_set_ps( 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 0.0f ) ); + sum0 = _mm_cvtss_f32( vsum ); + #if CPU_SSE3_SUPPORT + sum1 = _mm_cvtss_f32( _mm_movehdup_ps( vsum ) ); + #else + sum1 = _mm_cvtss_f32( _mm_shuffle_ps( vsum, vsum, 0x55 ) ); + #endif + sum2 = _mm_cvtss_f32( _mm_movehl_ps( vsum, vsum ) ); + sum3 = _mm_cvtss_f32( _mm_shuffle_ps( vsum, vsum, 0xff ) ); +#else + sum0 -= 0.5f*255.0f; + sum1 -= 0.5f*255.0f; + sum2 -= 0.5f*255.0f; +#endif + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ); + sum0 = (0.5f*255.0f) + ( sum0 * suminv ); + sum1 = (0.5f*255.0f) + ( sum1 * suminv ); + sum2 = (0.5f*255.0f) + ( sum2 * suminv ); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +static void imStaticKernel3NormalSustain( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, v0, v1, v2, energy, sum0, sum1, sum2, sumenergy, suminv; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sumenergy = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx + ( pointx << 1 ); + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + v0 = f * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = f * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = f * ( (float)src[ mapx + 2 ] - 127.5f ); + sum0 += v0; + sum1 += v1; + sum2 += v2; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + +static void imStaticKernel4NormalSustain( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy, suminv; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + sumenergy = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x]; + v0 = f * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = f * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = f * ( (float)src[ mapx + 2 ] - 127.5f ); + v3 = f * (float)src[ mapx + 3 ]; + sum0 += v0; + sum1 += v1; + sum2 += v2; + sum3 += v3; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +static void imStaticKernel4NormalSustainAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy; + float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy, suminv; + float *matrix; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + sumenergy = 0.0f; + matrix = state->matrix; + mapy = pointy; + for( y = 0 ; y < state->matrixsize ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = pointx << 2; + for( x = 0 ; x < state->matrixsize ; x++ ) + { + f = matrix[x] * (float)src[ mapx + 3 ]; + v0 = f * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = f * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = f * ( (float)src[ mapx + 2 ] - 127.5f ); + v3 = f; + sum0 += v0; + sum1 += v1; + sum2 += v2; + sum3 += v3; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + matrix = ADDRESS( matrix, state->matrixrowsize ); + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + if( sum3 >= state->minimumalphaf ) + { + f = 1.0f / sum3; + sum0 *= f; + sum1 *= f; + sum2 *= f; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } + + return; +} + + +//// + + +static void imStaticKernelPoT3Water( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy, heightmask, widthmask; + int minx, maxx, miny, maxy; + float f, sum0, sum1, sum2, suminv; + float *matrix; + unsigned char *src; + + minx = pointx; + maxx = minx + state->matrixsize; + miny = pointy; + maxy = miny + state->matrixsize; + heightmask = state->height - 1; + widthmask = state->width1 - 1; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + matrix = state->matrix; + for( y = miny ; y < maxy ; y++ ) + { + mapy = y & heightmask; + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + for( x = minx ; x < maxx ; x++, matrix++ ) + { + mapx = x & widthmask; + mapx += mapx << 1; + f = *matrix; + sum0 += (float)src[ mapx + 0 ] * f; + sum1 += (float)src[ mapx + 1 ] * f; + sum2 += (float)src[ mapx + 2 ] * f; + } + matrix += state->rowreturn; + } + + sum0 *= 1.0f/255.0f; + sum1 *= 1.0f/255.0f; + sum2 *= 1.0f/255.0f; + sum0 = 2.0f * ( sum0 - 0.5f ); + sum1 = 2.0f * ( sum1 - 0.5f ); + suminv = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + if( suminv < 0.75f ) + { + suminv = 0.5f / suminv; + sum0 = 0.5f + ( sum0 * suminv ); + sum1 = 0.5f + ( sum1 * suminv ); + } + if( sum2 > 0.1f ) + { + state->dithersum += sum2; + if( sum2 > 0.45f ) + sum2 = 1.0f; + else if( ( sum2 < 0.3f ) && ( state->dithersum < 1.0f ) ) + sum2 = 0.0f; + else + sum2 = ( ( sum2 + state->dithersum ) < 0.45f ? 0.0f : 1.0f ); + state->dithersum -= sum2; + } + sum0 *= 255.0f; + sum1 *= 255.0f; + sum2 *= 255.0f; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + +static void imStaticKernelPoT4Water( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy, heightmask, widthmask; + int minx, maxx, miny, maxy; + float f, sum0, sum1, sum2, sum3, suminv; + float *matrix; + unsigned char *src; + + minx = pointx; + maxx = minx + state->matrixsize; + miny = pointy; + maxy = miny + state->matrixsize; + heightmask = state->height - 1; + widthmask = state->width1 - 1; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrix = state->matrix; + for( y = miny ; y < maxy ; y++ ) + { + mapy = y & heightmask; + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + for( x = minx ; x < maxx ; x++, matrix++ ) + { + mapx = x & widthmask; + mapx <<= 2; + f = *matrix; + sum0 += (float)src[ mapx + 0 ] * f; + sum1 += (float)src[ mapx + 1 ] * f; + sum2 += (float)src[ mapx + 2 ] * f; + sum3 += (float)src[ mapx + 3 ] * f; + } + matrix += state->rowreturn; + } + + sum0 *= 1.0f/255.0f; + sum1 *= 1.0f/255.0f; + sum2 *= 1.0f/255.0f; + sum0 = 2.0f * ( sum0 - 0.5f ); + sum1 = 2.0f * ( sum1 - 0.5f ); + suminv = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + if( suminv < 0.75f ) + { + suminv = 0.5f / suminv; + sum0 = 0.5f + ( sum0 * suminv ); + sum1 = 0.5f + ( sum1 * suminv ); + } + if( sum2 > 0.1f ) + { + state->dithersum += sum2; + if( sum2 > 0.45f ) + sum2 = 1.0f; + else if( ( sum2 < 0.3f ) && ( state->dithersum < 1.0f ) ) + sum2 = 0.0f; + else + sum2 = ( ( sum2 + state->dithersum ) < 0.45f ? 0.0f : 1.0f ); + state->dithersum -= sum2; + } + sum0 *= 255.0f; + sum1 *= 255.0f; + sum2 *= 255.0f; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +static void imStaticKernelPoT4Plant( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ) +{ + int x, y, mapx, mapy, heightmask, widthmask; + int minx, maxx, miny, maxy; + float f, sum0, sum1, sum2, sum3; + float *matrix; + unsigned char *src; + + minx = pointx; + maxx = minx + state->matrixsize; + miny = pointy; + maxy = miny + state->matrixsize; + heightmask = state->height - 1; + widthmask = state->width1 - 1; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrix = state->matrix; + for( y = miny ; y < maxy ; y++ ) + { + mapy = y & heightmask; + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + for( x = minx ; x < maxx ; x++, matrix++ ) + { + mapx = x & widthmask; + mapx <<= 2; + f = *matrix; + sum0 += (float)src[ mapx + 0 ] * f; + sum1 += (float)src[ mapx + 1 ] * f; + sum2 += (float)src[ mapx + 2 ] * f; + sum3 += (float)src[ mapx + 3 ] * f; + } + matrix += state->rowreturn; + } + + sum3 *= 1.25f; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options ) +{ + int filter, x, y, pointx, pointy, basex, basey, pow2flag; + int newwidth, newheight; + unsigned char *dst; + imStaticMatrixState state; + void (*applykernel)( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ); +#if CPU_SSE2_SUPPORT + int corebase, corerange; + void (*applykernelcore)( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state ); +#endif + + filter = options->filter; + imBuildStaticMatrix( &state, sizedivisor, options->hopcount, options->alpha ); + + newwidth = ( width < sizedivisor ) ? 1 : ( ( width + sizedivisor - 1 ) / sizedivisor ); + newheight = ( height < sizedivisor ) ? 1 : ( ( height + sizedivisor - 1 ) / sizedivisor ); + + pow2flag = ccIsPow2Int32( width ) && ccIsPow2Int32( height ); + applykernel = 0; +#if CPU_SSE2_SUPPORT + applykernelcore = 0; +#endif + + if( filter == IM_REDUCE_FILTER_LINEAR ) + { + if( bytesperpixel == 4 ) + { + applykernel = imStaticKernel4Linear; +#if CPU_SSE2_SUPPORT + applykernelcore = imStaticKernel4Linear_Core; +#endif + } + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3Linear; + else if( bytesperpixel == 2 ) + applykernel = imStaticKernel2Linear; + else if( bytesperpixel == 1 ) + applykernel = imStaticKernel1Linear; + } + else if( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM ) + { + if( bytesperpixel == 4 ) + { + applykernel = imStaticKernel4LinearAlphaNorm; +#if CPU_SSE2_SUPPORT + applykernelcore = imStaticKernel4LinearAlphaNorm_Core; +#endif + } + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3Linear; + else if( bytesperpixel == 2 ) + applykernel = imStaticKernel2Linear; + else if( bytesperpixel == 1 ) + applykernel = imStaticKernel1Linear; + } + else if( filter == IM_REDUCE_FILTER_SRGB ) + { + if( bytesperpixel == 4 ) + { + applykernel = imStaticKernel4sRGB; +#if CPU_SSE2_SUPPORT + applykernelcore = imStaticKernel4sRGB_Core; +#endif + } + else if( bytesperpixel == 3 ) + { + applykernel = imStaticKernel3sRGB; +#if CPU_SSE2_SUPPORT + applykernelcore = imStaticKernel3sRGB_Core; +#endif + } + else if( bytesperpixel == 2 ) + applykernel = imStaticKernel2sRGB; + else if( bytesperpixel == 1 ) + applykernel = imStaticKernel1sRGB; + } + else if( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM ) + { + if( bytesperpixel == 4 ) + { + applykernel = imStaticKernel4sRGBAlphaNorm; +#if CPU_SSE2_SUPPORT + applykernelcore = imStaticKernel4sRGBAlphaNorm_Core; +#endif + } + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3sRGB; + else if( bytesperpixel == 2 ) + applykernel = imStaticKernel2sRGB; + else if( bytesperpixel == 1 ) + applykernel = imStaticKernel1sRGB; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP ) + { + if( bytesperpixel == 4 ) + applykernel = imStaticKernel4Normal; + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3Normal; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN ) + { + if( bytesperpixel == 4 ) + applykernel = imStaticKernel4NormalSustain; + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3NormalSustain; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM ) + { + if( bytesperpixel == 4 ) + applykernel = imStaticKernel4NormalSustainAlphaNorm; + else if( bytesperpixel == 3 ) + applykernel = imStaticKernel3NormalSustain; + } + else if( filter == IM_REDUCE_FILTER_WATERMAP ) + { + if( ( bytesperpixel == 4 ) && ( pow2flag ) ) + applykernel = imStaticKernelPoT4Water; + else if( ( bytesperpixel == 3 ) && ( pow2flag ) ) + applykernel = imStaticKernelPoT3Water; + } + else if( filter == IM_REDUCE_FILTER_PLANTMAP ) + { + if( ( bytesperpixel == 4 ) && ( pow2flag ) ) + applykernel = imStaticKernelPoT4Plant; + } + + if( !applykernel ) + return 0; + +#if CPU_SSE2_SUPPORT + corebase = -state.matrixoffset; + corerange = ( newwidth + state.matrixoffset ) - corebase; +#endif + + state.dithersum = 0.0f; + if( ( newwidth | newheight ) > 2 ) + state.dithersum = 0.5f; + + state.srcdata = srcdata; + state.width1 = width * 1; + state.width2 = width * 2; + state.width3 = width * 3; + state.width4 = width * 4; + state.height = height; + state.bytesperline = bytesperline; + + state.minimumalpha = 4; + state.minimumalphaf = (float)state.minimumalpha; + state.amplifynormal = fmaxf( 1.0f, options->amplifynormal ); + state.normalsustainfactor = options->normalsustainfactor; + + basex = ( state.matrixoffset + ( width << 8 ) ) % width; + basey = ( state.matrixoffset + ( height << 8 ) ) % height; + while( basex < 0 ) + basex += width; + while( basey < 0 ) + basey += height; + +#if CPU_SSE2_SUPPORT + if( applykernelcore ) + { + dst = dstdata; + pointy = basey; + for( y = 0 ; y < newheight ; y++ ) + { + pointx = basex; + for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel ) + { + ( (unsigned int)( x - corebase ) < corerange ? applykernelcore : applykernel )( dst, pointx, pointy, &state ); + pointx += sizedivisor; + while( pointx >= width ) + pointx -= width; + } + pointy += sizedivisor; + while( pointy >= height ) + pointy -= height; + } + } + else +#endif + { + dst = dstdata; + pointy = basey; + for( y = 0 ; y < newheight ; y++ ) + { + pointx = basex; + for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel ) + { + applykernel( dst, pointx, pointy, &state ); + pointx += sizedivisor; + while( pointx >= width ) + pointx -= width; + } + pointy += sizedivisor; + while( pointy >= height ) + pointy -= height; + } + } + + imFreeStaticState( &state ); + + return 1; +} + + +int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options ) +{ + int width, height; + int newwidth, newheight, retvalue; + + width = imgsrc->format.width; + height = imgsrc->format.height; + newwidth = ( width < sizedivisor ) ? 1 : ( ( width + sizedivisor - 1 ) / sizedivisor ); + newheight = ( height < sizedivisor ) ? 1 : ( ( height + sizedivisor - 1 ) / sizedivisor ); + + imgdst->format.width = newwidth; + imgdst->format.height = newheight; + imgdst->format.type = imgsrc->format.type; + imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel; + imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel; + imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline ); + if( !( imgdst->data ) ) + return 0; + + retvalue = imReduceImageKaiserDataDivisor( imgdst->data, imgsrc->data, width, height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, sizedivisor, options ); + + return retvalue; +} + + + +//////////////////////////////////////////////////////////////////////////////// + + + +static void imDynamicKernel1Linear( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0; + float matrixsum; + unsigned char *src; + + sum0 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + matrixsum += f; + mapx++; + if( mapx >= state->width1 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + sum0 /= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + + return; +} + + +static void imDynamicKernel2Linear( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1; + float matrixsum; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 1; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + matrixsum += f; + mapx += 2; + if( mapx >= state->width2 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + + return; +} + + +static void imDynamicKernel3Linear( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2; + float matrixsum; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 ); + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + matrixsum += f; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + + +static void imDynamicKernel4Linear( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2, sum3; + float matrixsum; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + sum3 += f * (float)src[ mapx + 3 ]; + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +static void imDynamicKernel4LinearAlphaNorm( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2, sum3, alpha; + float matrixsum; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + alpha = (float)src[ mapx + 3 ] * f; + sum0 += alpha * (float)src[ mapx + 0 ]; + sum1 += alpha * (float)src[ mapx + 1 ]; + sum2 += alpha * (float)src[ mapx + 2 ]; + sum3 += alpha; + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + if( sum3 >= state->minimumalphaf ) + { + f = 1.0 / sum3; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum0 * f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum1 * f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum2 * f ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } + + return; +} + + +#if CPU_SSE2_SUPPORT + +static void imDynamicKernel4LinearAlphaNorm_Core( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + uint32_t pixel; + unsigned char *src; + __m128 vmatrixsum, vsum0, vsum1, vsum2, vsum3; + __m128 vlx, vly, vf, valpha, vr, vg, vb, va, vsrcf; + __m128i vsrc, vshufmask; + __m128i vzero; + + #if CPU_SSSE3_SUPPORT + vshufmask = _mm_setr_epi8( 0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f ); + #endif + vsum0 = _mm_setzero_ps(); + vsum1 = _mm_setzero_ps(); + vsum2 = _mm_setzero_ps(); + vsum3 = _mm_setzero_ps(); + vmatrixsum = _mm_setzero_ps(); + vzero = _mm_castps_si128( _mm_setzero_ps() ); + + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + vly = _mm_set1_ps( state->lineary[y] ); + for( x = 0 ; x < state->matrixsizex ; x += 4 ) + { + vlx = _mm_load_ps( &state->linearx[x] ); + /* Load 16 bytes and unpack as RRRR,GGGG,BBBB,AAAA in one SSE register */ + vsrc = _mm_loadu_si128( (void *)&src[ mapx ] ); + #if CPU_SSSE3_SUPPORT + vsrc = _mm_shuffle_epi8( vsrc, vshufmask ); + #else + vshufmask = _mm_shuffle_epi32( vsrc, 0x39 ); + vsrc = _mm_unpacklo_epi16( _mm_unpacklo_epi8( vsrc, vshufmask ), _mm_unpackhi_epi8( vsrc, vshufmask ) ); + #endif + /* Break that into 4 SSE registers as floats: vR,vG,vB,vA */ + vsrcf = _mm_castsi128_ps( vsrc ); + vr = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( vsrcf ), vzero ) ); + #if CPU_SSE3_SUPPORT + vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehdup_ps( vsrcf ) ), vzero ) ); + #else + vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0x55 ) ), vzero ) ); + #endif + vb = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehl_ps( vsrcf, vsrcf ) ), vzero ) ); + va = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0xff ) ), vzero ) ); + vf = _mm_mul_ps( vlx, vly ); + valpha = _mm_mul_ps( va, vf ); + vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( vr, valpha ) ); + vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( vg, valpha ) ); + vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( vb, valpha ) ); + vsum3 = _mm_add_ps( vsum3, valpha ); + vmatrixsum = _mm_add_ps( vmatrixsum, vf ); + mapx += 16; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + #if CPU_SSE3_SUPPORT + vmatrixsum = _mm_hadd_ps( vmatrixsum, vmatrixsum ); + vmatrixsum = _mm_hadd_ps( vmatrixsum, vmatrixsum ); + #else + vmatrixsum = _mm_add_ps( vmatrixsum, _mm_shuffle_ps( vmatrixsum, vmatrixsum, 0x4e ) ); + vmatrixsum = _mm_add_ps( vmatrixsum, _mm_shuffle_ps( vmatrixsum, vmatrixsum, 0x39 ) ); + #endif + + #if CPU_SSE3_SUPPORT + vsum0 = _mm_hadd_ps( vsum0, vsum1 ); + vsum2 = _mm_hadd_ps( vsum2, vsum3 ); + vsum0 = _mm_hadd_ps( vsum0, vsum2 ); + #else + vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum2 ), _mm_unpackhi_ps( vsum0, vsum2 ) ); + vsum1 = _mm_add_ps( _mm_unpacklo_ps( vsum1, vsum3 ), _mm_unpackhi_ps( vsum1, vsum3 ) ); + vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum1 ), _mm_unpackhi_ps( vsum0, vsum1 ) ); + #endif + vsum0 = _mm_div_ps( vsum0, vmatrixsum ); + + valpha = _mm_shuffle_ps( vsum0, vsum0, 0xff ); + pixel = 0; + if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) ) + { + __m128i vpixel; + vsum0 = _mm_mul_ps( vsum0, _mm_rcp_ps( valpha ) ); + vsum0 = CPU_BLENDV_PS( vsum0, valpha, *(__m128 *)simd4fAlphaMask ); + vpixel = _mm_cvtps_epi32( vsum0 ); + vpixel = _mm_packs_epi32( vpixel, vpixel ); + vpixel = _mm_packus_epi16( vpixel, vpixel ); + pixel = (uint32_t)_mm_cvtsi128_si32( vpixel ); + } + *(uint32_t *)dst = pixel; + + return; +} + +#endif + + +//// + + +static void imDynamicKernel1sRGB( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float sum0; +#endif + float f, matrixsum; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; +#endif + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { +#if CPU_SSE2_SUPPORT + f = state->linearx[x] * state->lineary[y]; + vsrc = _mm_set_ss( (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) ); +#else + f = state->linearx[x] * state->lineary[y]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); +#endif + matrixsum += f; + mapx++; + if( mapx >= state->width1 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) ); + dst[0] = _mm_cvtsi128_si32( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ); +#else + sum0 /= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); +#endif + + return; +} + + +static void imDynamicKernel2sRGB( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float sum0, sum1; +#endif + float f, matrixsum; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; +#endif + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 1; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { +#if CPU_SSE2_SUPPORT + f = state->linearx[x] * state->lineary[y]; + vsrc = _mm_set_ps( 0.0f, 0.0f, (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) ); +#else + f = state->linearx[x] * state->lineary[y]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); +#endif + matrixsum += f; + mapx += 2; + if( mapx >= state->width2 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + union + { + char c[4]; + uint32_t i; + } u; + vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; + dst[1] = u.c[1]; +#else + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); +#endif + + return; +} + + +static void imDynamicKernel3sRGB( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float sum0, sum1, sum2; +#endif + float f, matrixsum; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; +#endif + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 ); + for( x = 0 ; x < state->matrixsizex ; x++ ) + { +#if CPU_SSE2_SUPPORT + f = state->linearx[x] * state->lineary[y]; + vsrc = _mm_set_ps( 0.0f, (float)src[ mapx + 2 ], (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) ); +#else + f = state->linearx[x] * state->lineary[y]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += f * srgb2linear( (float)src[ mapx + 2 ] ); +#endif + matrixsum += f; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + union + { + char c[4]; + uint32_t i; + } u; + vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) ); + _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); + dst[0] = u.c[0]; + dst[1] = u.c[1]; + dst[2] = u.c[2]; +#else + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) ); +#endif + + return; +} + + +static void imDynamicKernel4sRGB( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc; + __m128i vzero; +#else + float sum0, sum1, sum2, sum3; +#endif + float f, matrixsum; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { +#if CPU_SSE2_SUPPORT + f = state->linearx[x] * state->lineary[y]; + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) ); +#else + f = state->linearx[x] * state->lineary[y]; + sum0 += f * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += f * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += f * srgb2linear( (float)src[ mapx + 2 ] ); + sum3 += f * (float)src[ mapx + 3 ]; +#endif + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) ); + _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) ); +#else + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); +#endif + + return; +} + + +//// + + +static void imDynamicKernel4sRGBAlphaNorm( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; +#if CPU_SSE2_SUPPORT + __m128 vsum, vsrc, valpha; + __m128i vzero; + uint32_t pixel; +#else + float sum0, sum1, sum2, sum3, alpha; +#endif + float f, matrixsum; + unsigned char *src; + +#if CPU_SSE2_SUPPORT + vsum = _mm_setzero_ps(); + vzero = _mm_setzero_si128(); +#else + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; +#endif + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { +#if CPU_SSE2_SUPPORT + f = state->linearx[x] * state->lineary[y]; + vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) ); + valpha = _mm_shuffle_ps( vsrc, _mm_set_ss( 1.0f ), 0x0f ); + vsrc = srgb2linear3( vsrc ); + vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha, valpha, 0xC0 ), _mm_set1_ps( f ) ), vsrc ) ); +#else + f = state->linearx[x] * state->lineary[y]; + alpha = (float)src[ mapx + 3 ] * f; + sum0 += alpha * srgb2linear( (float)src[ mapx + 0 ] ); + sum1 += alpha * srgb2linear( (float)src[ mapx + 1 ] ); + sum2 += alpha * srgb2linear( (float)src[ mapx + 2 ] ); + sum3 += alpha; +#endif + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + +#if CPU_SSE2_SUPPORT + vsum = _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ); + valpha = _mm_shuffle_ps( vsum, vsum, 0xff ); + pixel = 0; + if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) ) + { + __m128i vpixel; + vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha ) ); + vsum = CPU_BLENDV_PS( vsum, valpha, *(__m128 *)simd4fAlphaMask ); + vsum = linear2srgb3( vsum ); + vpixel = _mm_cvtps_epi32( vsum ); + vpixel = _mm_packs_epi32( vpixel, vpixel ); + vpixel = _mm_packus_epi16( vpixel, vpixel ); + pixel = (uint32_t)_mm_cvtsi128_si32( vpixel ); + } + *(uint32_t *)dst = pixel; +#else + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + if( sum3 >= state->minimumalphaf ) + { + f = 1.0f / sum3; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * f ) + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } +#endif + + return; +} + + +//// + + +static void imDynamicKernel3Normal( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2; + float matrixsum, suminv; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 ); + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + matrixsum += f; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = (1.0f/255.0f) / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum0 -= 0.5f; + sum1 -= 0.5f; + sum2 -= 0.5f; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ); + sum0 = (0.5f*255.0f) + ( sum0 * suminv ); + sum1 = (0.5f*255.0f) + ( sum1 * suminv ); + sum2 = (0.5f*255.0f) + ( sum2 * suminv ); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + + +static void imDynamicKernel4Normal( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, sum0, sum1, sum2, sum3; + float matrixsum, suminv; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + sum0 += f * (float)src[ mapx + 0 ]; + sum1 += f * (float)src[ mapx + 1 ]; + sum2 += f * (float)src[ mapx + 2 ]; + sum3 += f * (float)src[ mapx + 3 ]; + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = (1.0f/255.0f) / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + sum0 -= 0.5f; + sum1 -= 0.5f; + sum2 -= 0.5f; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ); + sum0 = (0.5f*255.0f) + ( sum0 * suminv ); + sum1 = (0.5f*255.0f) + ( sum1 * suminv ); + sum2 = (0.5f*255.0f) + ( sum2 * suminv ); + sum3 *= 255.0f; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + return; +} + + +//// + + +static void imDynamicKernel3NormalSustain( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, v0, v1, v2, energy, sum0, sum1, sum2, sumenergy; + float matrixsum, suminv; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sumenergy = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 ); + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + v0 = f * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = f * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = f * ( (float)src[ mapx + 2 ] - 127.5f ); + sum0 += v0; + sum1 += v1; + sum2 += v2; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + matrixsum += f; + mapx += 3; + if( mapx >= state->width3 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = (1.0f/255.0f) / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + + return; +} + +static void imDynamicKernel4NormalSustain( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy; + float matrixsum, suminv; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + sumenergy = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + v0 = f * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = f * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = f * ( (float)src[ mapx + 2 ] - 127.5f ); + v3 = f * (float)src[ mapx + 3 ]; + sum0 += v0; + sum1 += v1; + sum2 += v2; + sum3 += v3; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = (1.0f/255.0f) / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + sum3 *= 255.0f; + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + + + return; +} + + +//// + + +static void imDynamicKernel4NormalSustainAlphaNorm( unsigned char *dst, imGenericMatrixState *state ) +{ + int x, y, mapx, mapy; + float f, alpha, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy; + float matrixsum, suminv; + unsigned char *src; + + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + sumenergy = 0.0f; + matrixsum = 0.0f; + mapy = state->matrixoffsety; + for( y = 0 ; y < state->matrixsizey ; y++ ) + { + src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) ); + mapx = state->matrixoffsetx << 2; + for( x = 0 ; x < state->matrixsizex ; x++ ) + { + f = state->linearx[x] * state->lineary[y]; + alpha = (float)src[ mapx + 3 ] * f; + v0 = alpha * ( (float)src[ mapx + 0 ] - 127.5f ); + v1 = alpha * ( (float)src[ mapx + 1 ] - 127.5f ); + v2 = alpha * ( (float)src[ mapx + 2 ] - 127.5f ); + v3 = alpha; + sum0 += v0; + sum1 += v1; + sum2 += v2; + sum3 += v3; + energy = ( v0 * v0 ) + ( v1 * v1 ); + if( energy ) + sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) ); + matrixsum += f; + mapx += 4; + if( mapx >= state->width4 ) + mapx = 0; + } + mapy++; + if( mapy >= state->height ) + mapy = 0; + } + + matrixsum = 1.0f / matrixsum; + sum0 *= matrixsum; + sum1 *= matrixsum; + sum2 *= matrixsum; + sum3 *= matrixsum; + if( sum3 >= state->minimumalphaf ) + { + f = 1.0f / sum3; + sum0 *= f; + sum1 *= f; + sum2 *= f; + sum0 *= state->amplifynormal; + sum1 *= state->amplifynormal; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) ); + sumenergy *= state->normalsustainfactor; + if( energy < sumenergy ) + { + f = fminf( sumenergy / energy, 8.0f ); + sum0 *= f; + sum1 *= f; + suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) ); + sum0 *= suminv; + sum1 *= suminv; + sum2 *= suminv; + } + sum0 += (0.5f*255.0f); + sum1 += (0.5f*255.0f); + sum2 += (0.5f*255.0f); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) ); + dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) ); + } + else + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } + + return; +} + + +//// + + +int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options ) +{ + int filter, x, y; + float scalex, scaley, scaleinvx, scaleinvy; + float sourcex, sourcey; + unsigned char *dst; + imGenericMatrixState state; + void (*applykernel)( unsigned char *dst, imGenericMatrixState *state ); +#if CPU_SSE2_SUPPORT + void (*applykernelcore)( unsigned char *dst, imGenericMatrixState *state ); +#endif + + filter = options->filter; + if( ( newwidth > width ) || ( newheight > height ) ) + return 0; + + applykernel = 0; +#if CPU_SSE2_SUPPORT + applykernelcore = 0; +#endif + + if( filter == IM_REDUCE_FILTER_LINEAR ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4Linear; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3Linear; + else if( bytesperpixel == 2 ) + applykernel = imDynamicKernel2Linear; + else if( bytesperpixel == 1 ) + applykernel = imDynamicKernel1Linear; + } + else if( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM ) + { + if( bytesperpixel == 4 ) + { + applykernel = imDynamicKernel4LinearAlphaNorm; +#if CPU_SSE2_SUPPORT + applykernelcore = imDynamicKernel4LinearAlphaNorm_Core; +#endif + } + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3Linear; + else if( bytesperpixel == 2 ) + applykernel = imDynamicKernel2Linear; + else if( bytesperpixel == 1 ) + applykernel = imDynamicKernel1Linear; + } + else if( filter == IM_REDUCE_FILTER_SRGB ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4sRGB; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3sRGB; + else if( bytesperpixel == 2 ) + applykernel = imDynamicKernel2sRGB; + else if( bytesperpixel == 1 ) + applykernel = imDynamicKernel1sRGB; + } + else if( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4sRGBAlphaNorm; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3sRGB; + else if( bytesperpixel == 2 ) + applykernel = imDynamicKernel2sRGB; + else if( bytesperpixel == 1 ) + applykernel = imDynamicKernel1sRGB; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4Normal; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3Normal; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4NormalSustain; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3NormalSustain; + } + else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM ) + { + if( bytesperpixel == 4 ) + applykernel = imDynamicKernel4NormalSustainAlphaNorm; + else if( bytesperpixel == 3 ) + applykernel = imDynamicKernel3NormalSustain; + } + + if( !applykernel ) + return 0; + + state.minimumalpha = 4; + state.minimumalphaf = (float)state.minimumalpha; + state.amplifynormal = fmaxf( 1.0f, options->amplifynormal ); + state.normalsustainfactor = options->normalsustainfactor; + + state.dithersum = 0.0f; + if( ( newwidth | newheight ) > 2 ) + state.dithersum = 0.5f; + + state.srcdata = srcdata; + state.width1 = width * 1; + state.width2 = width * 2; + state.width3 = width * 3; + state.width4 = width * 4; + state.height = height; + state.bytesperline = bytesperline; + + scalex = (float)newwidth / (float)width; + scaley = (float)newheight / (float)height; + scaleinvx = (float)width / (float)newwidth; + scaleinvy = (float)height / (float)newheight; + + imAllocGenericState( &state, scalex, scaley, options->hopcount, options->alpha ); + +#if CPU_SSE2_SUPPORT + if( applykernelcore ) + { + dst = dstdata; + for( y = 0 ; y < newheight ; y++ ) + { + sourcey = ( ( (float)y + 0.5f ) * scaleinvy ) - 0.5f; + imBuildGenericLinearY( &state, scaley, scaleinvy, sourcey, options->hopcount, options->alpha, height ); + for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel ) + { + sourcex = ( ( (float)x + 0.5f ) * scaleinvx ) - 0.5f; + imBuildGenericLinearX( &state, scalex, scaleinvx, sourcex, options->hopcount, options->alpha, width ); + if( ( state.matrixoffsetx + ( ( state.matrixsizex + 3 ) & ~3 ) ) < width ) + applykernelcore( dst, &state ); + else + applykernel( dst, &state ); + } + } + } + else +#endif + { + dst = dstdata; + for( y = 0 ; y < newheight ; y++ ) + { + sourcey = ( ( (float)y + 0.5f ) * scaleinvy ) - 0.5f; + imBuildGenericLinearY( &state, scaley, scaleinvy, sourcey, options->hopcount, options->alpha, height ); + for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel ) + { + sourcex = ( ( (float)x + 0.5f ) * scaleinvx ) - 0.5f; + imBuildGenericLinearX( &state, scalex, scaleinvx, sourcex, options->hopcount, options->alpha, width ); + applykernel( dst, &state ); + } + } + } + + imFreeGenericState( &state ); + + return 1; +} + + +int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options ) +{ + int retvalue; + + imgdst->format.width = newwidth; + imgdst->format.height = newheight; + imgdst->format.type = imgsrc->format.type; + imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel; + imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel; + imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline ); + if( !( imgdst->data ) ) + return 0; + + retvalue = imReduceImageKaiserData( imgdst->data, imgsrc->data, imgsrc->format.width, imgsrc->format.height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, newwidth, newheight, options ); + + return retvalue; +} + + + +//////////////////////////////////////////////////////////////////////////////// + + + +static inline CC_ALWAYSINLINE void imReduceHalfBox1Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox2Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 ); + dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox3Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 ); + dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 ); + dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox4Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 ); + dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 ); + dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 ); + dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox1sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + int i, offset[4]; + float sum0; + offset[0] = 0; + offset[1] = bytesperpixel; + offset[2] = bytesperline; + offset[3] = bytesperline + bytesperpixel; + sum0 = 0.0f; + for( i = 0 ; i < 4 ; i++ ) + sum0 += srgb2linear( (float)src[offset[i]+0] ); + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox2sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + int i, offset[4]; + float sum0, sum1; + offset[0] = 0; + offset[1] = bytesperpixel; + offset[2] = bytesperline; + offset[3] = bytesperline + bytesperpixel; + sum0 = 0.0f; + sum1 = 0.0f; + for( i = 0 ; i < 4 ; i++ ) + { + sum0 += srgb2linear( (float)src[offset[i]+0] ); + sum1 += srgb2linear( (float)src[offset[i]+1] ); + } + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox3sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + int i, offset[4]; + float sum0, sum1, sum2; + offset[0] = 0; + offset[1] = bytesperpixel; + offset[2] = bytesperline; + offset[3] = bytesperline + bytesperpixel; + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + for( i = 0 ; i < 4 ; i++ ) + { + sum0 += srgb2linear( (float)src[offset[i]+0] ); + sum1 += srgb2linear( (float)src[offset[i]+1] ); + sum2 += srgb2linear( (float)src[offset[i]+2] ); + } + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * 0.25f ) + 0.5f ) ) ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox4sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + int i, offset[4], sum3; + float sum0, sum1, sum2; + offset[0] = 0; + offset[1] = bytesperpixel; + offset[2] = bytesperline; + offset[3] = bytesperline + bytesperpixel; + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 2; + for( i = 0 ; i < 4 ; i++ ) + { + sum0 += srgb2linear( (float)src[offset[i]+0] ); + sum1 += srgb2linear( (float)src[offset[i]+1] ); + sum2 += srgb2linear( (float)src[offset[i]+2] ); + sum3 += (int)src[offset[i]+2]; + } + dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) ); + dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) ); + dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * 0.25f ) + 0.5f ) ) ); + dst[3] = (unsigned char)( sum3 >> 2 ); + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox3Normal( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + float v0, v1, v2, suminv; + + v0 = (1.0/1020.0) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] ); + v1 = (1.0/1020.0) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] ); + v2 = (1.0/1020.0) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] ); + v0 = 2.0 * ( v0 - 0.5 ); + v1 = 2.0 * ( v1 - 0.5 ); + v2 = 2.0 * ( v2 - 0.5 ); + suminv = 0.5 / sqrtf( ( v0 * v0 ) + ( v1 * v1 ) + ( v2 * v2 ) ); + v0 = 0.5 + ( v0 * suminv ); + v1 = 0.5 + ( v1 * suminv ); + v2 = 0.5 + ( v2 * suminv ); + dst[0] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v0 ); + dst[1] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v1 ); + dst[2] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v2 ); + + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox4Normal( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + float v0, v1, v2, suminv; + + v0 = (1.0/1020.0) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] ); + v1 = (1.0/1020.0) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] ); + v2 = (1.0/1020.0) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] ); + v0 = 2.0 * ( v0 - 0.5 ); + v1 = 2.0 * ( v1 - 0.5 ); + v2 = 2.0 * ( v2 - 0.5 ); + suminv = 0.5 / sqrtf( ( v0 * v0 ) + ( v1 * v1 ) + ( v2 * v2 ) ); + v0 = 0.5 + ( v0 * suminv ); + v1 = 0.5 + ( v1 * suminv ); + v2 = 0.5 + ( v2 * suminv ); + dst[0] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v0 ); + dst[1] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v1 ); + dst[2] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v2 ); + dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 ); + + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox3Water( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + float v0, v1, v2, suminv; + + v0 = (1.0f/1020.0f) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] ); + v1 = (1.0f/1020.0f) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] ); + v2 = (1.0f/1020.0f) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] ); + + v0 = 2.0f * ( v0 - 0.5f ); + v1 = 2.0f * ( v1 - 0.5f ); + suminv = sqrtf( ( v0 * v0 ) + ( v1 * v1 ) ); + if( suminv < 0.75f ) + { + suminv = 0.5f / suminv; + v0 = 0.5f + ( v0 * suminv ); + v1 = 0.5f + ( v1 * suminv ); + } + if( v2 > 0.1f ) + { + *dithersum += v2; + if( v2 > 0.45f ) + v2 = 1.0f; + else if( ( v2 < 0.3f ) && ( *dithersum < 1.0f ) ) + v2 = 0.0f; + else + v2 = ( ( v2 + *dithersum ) < 0.45f ? 0.0f : 1.0f ); + *dithersum -= v2; + } + v0 *= 255.0f; + v1 *= 255.0f; + v2 *= 255.0f; + + dst[0] = (int)( fmaxf( 0.0f, fminf( 255.0f, v0 + 0.5f ) ) ); + dst[1] = (int)( fmaxf( 0.0f, fminf( 255.0f, v1 + 0.5f ) ) ); + dst[2] = (int)( fmaxf( 0.0f, fminf( 255.0f, v2 + 0.5f ) ) ); + + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox4Water( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + float v0, v1, v2, suminv; + + v0 = (1.0f/1020.0f) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] ); + v1 = (1.0f/1020.0f) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] ); + v2 = (1.0f/1020.0f) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] ); + + v0 = 2.0f * ( v0 - 0.5f ); + v1 = 2.0f * ( v1 - 0.5f ); + suminv = sqrtf( ( v0 * v0 ) + ( v1 * v1 ) ); + if( suminv < 0.75f ) + { + suminv = 0.5f / suminv; + v0 = 0.5f + ( v0 * suminv ); + v1 = 0.5f + ( v1 * suminv ); + } + if( v2 > 0.1f ) + { + *dithersum += v2; + if( v2 > 0.45f ) + v2 = 1.0f; + else if( ( v2 < 0.3f ) && ( *dithersum < 1.0f ) ) + v2 = 0.0f; + else + v2 = ( ( v2 + *dithersum ) < 0.45f ? 0.0f : 1.0f ); + *dithersum -= v2; + } + v0 *= 255.0f; + v1 *= 255.0f; + v2 *= 255.0f; + + dst[0] = (int)( fmaxf( 0.0f, fminf( 255.0f, v0 + 0.5f ) ) ); + dst[1] = (int)( fmaxf( 0.0f, fminf( 255.0f, v1 + 0.5f ) ) ); + dst[2] = (int)( fmaxf( 0.0f, fminf( 255.0f, v2 + 0.5f ) ) ); + dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 ); + + return; +} + +static inline CC_ALWAYSINLINE void imReduceHalfBox4Plant( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) +{ + int alpha; + + dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 ); + dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 ); + dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 ); + + alpha = ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] ); + alpha += alpha >> 2; + alpha = ( alpha + 2 ) >> 2; + if( alpha > 255 ) + alpha = 255; + dst[3] = (unsigned char)alpha; + + return; +} + + +static inline CC_ALWAYSINLINE void imReduceImageHalfBoxWork( unsigned char *dst, unsigned char *src, int width, int height, int bytesperpixel, int bytesperline, void (*work)( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) ) +{ + int x, y, newwidth, newheight, rowoffset; + float dithersum; + newwidth = ( width < 2 ) ? 1 : ( ( width + 1 ) / 2 ); + newheight = ( height < 2 ) ? 1 : ( ( height + 1 ) / 2 ); + rowoffset = bytesperline + ( bytesperpixel * ( width - ( newwidth << 1 ) ) ); + dithersum = 0.0f; + if( ( newwidth | newheight ) > 2 ) + dithersum = 0.5f; + for( y = 0 ; y < newheight ; y++ ) + { + for( x = 0 ; x < newwidth ; x++, src += bytesperpixel, dst += bytesperpixel ) + { + work( dst, src, bytesperpixel, bytesperline, &dithersum ); + src += bytesperpixel; + } + src += rowoffset; + } + return; +} + + +int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options ) +{ + int filter, retval; + + filter = options->filter; + retval = 1; + if( ( filter == IM_REDUCE_FILTER_LINEAR ) || ( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM ) ) + { + if( bytesperpixel == 4 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Linear ); + else if( bytesperpixel == 3 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Linear ); + else if( bytesperpixel == 2 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox2Linear ); + else if( bytesperpixel == 1 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox1Linear ); + else + retval = 0; + } + else if( ( filter == IM_REDUCE_FILTER_SRGB ) || ( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM ) ) + { + if( bytesperpixel == 4 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4sRGB ); + else if( bytesperpixel == 3 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3sRGB ); + else if( bytesperpixel == 2 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox2sRGB ); + else if( bytesperpixel == 1 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox1sRGB ); + else + retval = 0; + } + else if( ( filter == IM_REDUCE_FILTER_NORMALMAP ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_ALPHANORM ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM ) ) + { + if( bytesperpixel == 4 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Normal ); + else if( bytesperpixel == 3 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Normal ); + else + retval = 0; + } + else if( filter == IM_REDUCE_FILTER_WATERMAP ) + { + if( bytesperpixel == 4 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Water ); + else if( bytesperpixel == 3 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Water ); + else + retval = 0; + } + else if( filter == IM_REDUCE_FILTER_PLANTMAP ) + { + if( bytesperpixel == 4 ) + imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Plant ); + else + retval = 0; + } + else + retval = 0; + + return retval; +} + + +int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options ) +{ + int newwidth, newheight, retvalue; + + newwidth = ( ( imgsrc->format.width < 2 ) ? 1 : ( ( imgsrc->format.width + 1 ) / 2 ) ); + newheight = ( ( imgsrc->format.height < 2 ) ? 1 : ( ( imgsrc->format.height + 1 ) / 2 ) ); + + imgdst->format.width = newwidth; + imgdst->format.height = newheight; + imgdst->format.type = imgsrc->format.type; + imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel; + imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel; + imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline ); + + retvalue = imReduceImageHalfBoxData( imgdst->data, imgsrc->data, imgsrc->format.width, imgsrc->format.height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, options ); + + return retvalue; +} + + + +//////////////////////////////////////////////////////////////////////////////// + + + +int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags ) +{ + int layerindex, level, srclevel, srcwidth, srcheight, method, divisor; + int levelwidth, levelheight; + void *src, *dst; + + cascade->width = width; + cascade->height = height; + cascade->layercount = layercount; + cascade->bytesperpixel = bytesperpixel; + cascade->bytesperline = bytesperline; + cascade->options = options; + + /* No need for mipmaps */ + if( ( cascade->width == 1 ) && ( cascade->height == 1 ) ) + return 1; + if( bytesperpixel != 4 ) + cascadeflags &= ~( IM_CASCADE_FLAGS_COLOR_BORDER_BASE | IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS ); + + /* Allocate all the mipmap levels */ + if( !( layercount ) ) + layercount = 1; + cascade->mipmap[0] = imagedata; + levelwidth = cascade->width; + levelheight = cascade->height; + for( level = 1 ; ; level++ ) + { + levelwidth = ( levelwidth < 2 ) ? 1 : ( levelwidth >> 1 ); + levelheight = ( levelheight < 2 ) ? 1 : ( levelheight >> 1 ); + if( !( cascade->mipmap[level] = malloc( levelwidth * levelheight * layercount * bytesperpixel ) ) ) + return 0; + if( ( levelwidth == 1 ) && ( levelheight == 1 ) ) + break; + } + cascade->mipmap[level+1] = 0; + + if( cascadeflags & IM_CASCADE_FLAGS_COLOR_BORDER_BASE ) + imPropagateAlphaBorder( imagedata, width, height * layercount, bytesperpixel, bytesperline ); + + /* For every layer, compute all its mipmap */ + for( layerindex = 0 ; layerindex < layercount ; layerindex++ ) + { + levelwidth = cascade->width; + levelheight = cascade->height; + for( level = 1 ; cascade->mipmap[level] ; level++ ) + { + levelwidth = ( levelwidth < 2 ) ? 1 : ( levelwidth >> 1 ); + levelheight = ( levelheight < 2 ) ? 1 : ( levelheight >> 1 ); + dst = ADDRESS( cascade->mipmap[level], layerindex * levelwidth * levelheight * bytesperpixel ); + + /* Decide what method and source level to pick */ + if( ( levelwidth | levelheight ) >= 16 ) + { + srclevel = level - 2; + if( srclevel < 0 ) + srclevel = 0; + method = 1; + } + else + { + srclevel = level - 1; + method = 0; + } +#if DEBUG_VERBOSE + printf( "Tex level %d, srclevel %d, layer %d, filter %d, method %d : %d x %d\n", level, srclevel, layerindex, options->filter, method, levelwidth, levelheight ); +#endif + srcwidth = width >> srclevel; + if( !( srcwidth ) ) + srcwidth = 1; + srcheight = height >> srclevel; + if( !( srcheight ) ) + srcheight = 1; + if( srclevel ) + src = ADDRESS( cascade->mipmap[srclevel], layerindex * srcheight * srcwidth * bytesperpixel ); + else + src = ADDRESS( cascade->mipmap[srclevel], layerindex * srcheight * cascade->bytesperline ); + + divisor = 1 << ( level - srclevel ); + if( ( ( levelwidth * divisor ) != srcwidth ) || ( ( levelheight * divisor ) != srcheight ) ) + method = 2; + + if( method == 2 ) + { + if( !( imReduceImageKaiserData( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, levelwidth, levelheight, options ) ) ) + { + printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ ); + return 0; + } + } + else if( method == 1 ) + { + if( !( imReduceImageKaiserDataDivisor( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, divisor, options ) ) ) + { + printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ ); + return 0; + } + } + else + { + if( !( imReduceImageHalfBoxData( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, options ) ) ) + { + printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ ); + return 0; + } + } + + if( cascadeflags & IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS ) + imPropagateAlphaBorder( dst, levelwidth, levelheight, bytesperpixel, levelwidth * bytesperpixel ); + } + } + + return 1; +} + + +void imFreeMipmapCascade( imMipmapCascade *cascade ) +{ + int level; + for( level = 1 ; ; level++ ) + { + if( !( cascade->mipmap[level] ) ) + break; + free( cascade->mipmap[level] ); + cascade->mipmap[level] = 0; + } + return; +} + + +//// + + +#define IM_PIXEL_ALPHA_MASK (0xff000000) +#define IM_PIXEL_RGB_MASK (0x00ffffff) + +void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline ) +{ + int x, y, backtrackflag; + uint32_t pixel, refcolor, prevrowpixel; + uint32_t *row, *prevrow; + + if( bytesperpixel != 4 ) + return; + row = (uint32_t *)imagedata; + prevrow = row; + for( y = 0 ; y < height ; y++ ) + { + refcolor = 0; + backtrackflag = 0; + for( x = 0 ; x < width ; x++ ) + { + pixel = row[x]; + prevrowpixel = prevrow[x]; + if( pixel & IM_PIXEL_ALPHA_MASK ) + { + /* Pixel has some color, spread to neighbor if applicable */ + refcolor = pixel & IM_PIXEL_RGB_MASK; + if( backtrackflag ) + { + row[x-1] = refcolor; + backtrackflag = 0; + } + if( !( prevrowpixel & IM_PIXEL_ALPHA_MASK ) ) + prevrow[x] = refcolor; + } + else + { + /* Pixel is fully transparent, spread from neighbor if applicable */ + if( refcolor ) + { + row[x] = refcolor; + backtrackflag = 0; + refcolor = 0; + } + else if( prevrowpixel & IM_PIXEL_ALPHA_MASK ) + { + row[x] = prevrowpixel & IM_PIXEL_RGB_MASK; + backtrackflag = 0; + } + else + backtrackflag = 1; + } + } + prevrow = row; + row = ADDRESS( row, bytesperline ); + } + + return; +} + + + diff --git a/lib/graphics_utils/mipmap/imgresize.h b/lib/graphics_utils/mipmap/imgresize.h new file mode 100644 index 000000000..b47d07a6b --- /dev/null +++ b/lib/graphics_utils/mipmap/imgresize.h @@ -0,0 +1,150 @@ +/* ----------------------------------------------------------------------------- + * + * Copyright (c) 2014-2017 Alexis Naveros. + * Portions developed under contract to the SURVICE Engineering Company. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * ----------------------------------------------------------------------------- + */ + + +#ifndef IMGRESIZE_H +#define IMGRESIZE_H + + +typedef struct +{ + /* Specify filter type, from the IM_REDUCE_FILTER_* list */ + int filter; + /* High quality, a little slow: hopcount=3; */ + /* Good quality, much faster: hopcount=2; */ + int hopcount; + /* Strong preservation/amplification of details: alpha=2.0f; */ + /* Mild preservation/amplification of details: alpha=6.0f; */ + float alpha; + /* NORMALMAP filters: factor to amyplify normals on X and Y before normalization */ + float amplifynormal; + /* NORMALMAP_SUSTAIN filters: Preserve a factor of deviation "energy" as calculated by sqrtf(x*x+y*y) */ + float normalsustainfactor; +} imReduceOptions; + +static inline void imReduceSetOptions( imReduceOptions *options, int filter, int hopcount, float alpha, float amplifynormal, float normalsustainfactor ) +{ + options->filter = filter; + options->hopcount = hopcount; + options->alpha = alpha; + options->amplifynormal = amplifynormal; + options->normalsustainfactor = normalsustainfactor; + return; +} + + +/* Reduce the image's dimensions by an integer divisor ~ this is fairly fast */ +int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options ); +/* Same as imReduceImageKaiserDataDivisor(), but imgdst is allocated */ +int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options ); + + +/* Reduce the image's dimensions to match the newwidth and newheight ~ this is a little slower */ +int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options ); +/* Same as imReduceImageKaiserData(), but imgdst is allocated */ +int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options ); + + +/* Resize by half with a dumb box filter ~ don't use that except for the smallest mipmaps */ +/* Filters with ALPHANORM and/or SUSTAIN keywords are processed as the regular base filter only */ +int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options ); +int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options ); + + +/* +Keywords for image reduction filters + +LINEAR: Data is linear, note that this is *not* the format of typical diffuse textures +SRGB: Color is in sRGB space, any alpha is presumed linear +NORMALMAP: RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear + +ALPHANORM: Alpha normalization, the weight of pixels is proportional to their alpha values + (do you have "black" fully transparent pixels? please use an ALPHANORM filter) +SUSTAIN: The "energy" of the normal map is sustained, amplified to preserve the level of details + Note that this filter is rather slow (set options->normalsustainfactor to 0.75 or so) +*/ + +enum +{ + /* Linear space */ + IM_REDUCE_FILTER_LINEAR, + IM_REDUCE_FILTER_LINEAR_ALPHANORM, + + /* sRGB space (probably what you want for diffuse textures) */ + IM_REDUCE_FILTER_SRGB, + IM_REDUCE_FILTER_SRGB_ALPHANORM, + + /* RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear */ + IM_REDUCE_FILTER_NORMALMAP, + IM_REDUCE_FILTER_NORMALMAP_ALPHANORM, + IM_REDUCE_FILTER_NORMALMAP_SUSTAIN, + IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM, + + /* Custom specialized filters */ + IM_REDUCE_FILTER_WATERMAP, + IM_REDUCE_FILTER_PLANTMAP, + IM_REDUCE_FILTER_FOLLIAGE, + IM_REDUCE_FILTER_SKY, + IM_REDUCE_FILTER_FOG +}; + + +//// + + +#define IM_MIPMAP_CASCADE_MAX (16) + +typedef struct +{ + int width; + int height; + int layercount; + int bytesperpixel; + int bytesperline; + imReduceOptions *options; + void *mipmap[IM_MIPMAP_CASCADE_MAX]; +} imMipmapCascade; + + +int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags ); + +void imFreeMipmapCascade( imMipmapCascade *cascade ); + +/* For base texture, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */ +#define IM_CASCADE_FLAGS_COLOR_BORDER_BASE (0x1) +/* For generated mipmaps, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */ +#define IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS (0x2) + + +//// + + +void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline ); + + +//// + + +#endif + diff --git a/src/config/user_config.hpp b/src/config/user_config.hpp index 6bdfa1ae8..3efb659ee 100644 --- a/src/config/user_config.hpp +++ b/src/config/user_config.hpp @@ -934,6 +934,11 @@ namespace UserConfigParams PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked", "Enable all karts and tracks") ); + PARAM_PREFIX BoolUserConfigParam m_hq_mipmap + PARAM_DEFAULT( BoolUserConfigParam(false, "hq_mipmap", + "Generate mipmap for textures using " + "high quality method with SSE") ); + // TODO? implement blacklist for new irrlicht device and GUI PARAM_PREFIX std::vector m_blacklist_res; diff --git a/src/graphics/central_settings.cpp b/src/graphics/central_settings.cpp index 9ad4e7495..d34b3e102 100644 --- a/src/graphics/central_settings.cpp +++ b/src/graphics/central_settings.cpp @@ -490,7 +490,7 @@ bool CentralVideoSettings::isARBPixelBufferObjectUsable() const bool CentralVideoSettings::supportsThreadedTextureLoading() const { - return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable(); + return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable() && isARBTextureStorageUsable(); } #endif // !SERVER_ONLY diff --git a/src/graphics/hq_mipmap_generator.cpp b/src/graphics/hq_mipmap_generator.cpp new file mode 100644 index 000000000..4939f0ed5 --- /dev/null +++ b/src/graphics/hq_mipmap_generator.cpp @@ -0,0 +1,119 @@ +// SuperTuxKart - a fun racing game with go-kart +// Copyright (C) 2017 SuperTuxKart-Team +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 3 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +#include "graphics/hq_mipmap_generator.hpp" +#define DUMP_MIPMAP +#ifdef DUMP_MIPMAP +#include "graphics/irr_driver.hpp" +#include "utils/string_utils.hpp" +#endif +#include + +extern "C" +{ + #include + #include +} + +// ---------------------------------------------------------------------------- +HQMipmapGenerator::HQMipmapGenerator(const io::path& name, uint8_t* data, + const core::dimension2d& size, + GLuint texture_name, bool single_channel) + : video::ITexture(name), m_orig_data(data), m_size(size), + m_texture_name(texture_name), m_texture_size(0), + m_single_channel(single_channel), m_mipmap_data(NULL) +{ + unsigned width = m_size.Width; + unsigned height = m_size.Height; + while (true) + { + width = width < 2 ? 1 : width >> 1; + height = height < 2 ? 1 : height >> 1; + m_mipmap_sizes.emplace_back(core::dimension2du(width, height), + m_texture_size); + m_texture_size += width * height * (m_single_channel ? 1 : 4); + if (width == 1 && height == 1) + break; + } + m_texture_size = unsigned(m_mipmap_sizes.back().second) + + (m_single_channel ? 1 : 4); + m_mipmap_data = malloc(sizeof(imMipmapCascade)); +} // HQMipmapGenerator + +// ---------------------------------------------------------------------------- +HQMipmapGenerator::~HQMipmapGenerator() +{ + imFreeMipmapCascade((imMipmapCascade*)m_mipmap_data); + free(m_mipmap_data); +} // ~HQMipmapGenerator + +// ---------------------------------------------------------------------------- +void HQMipmapGenerator::threadedReload(void* ptr, void* param) const +{ + imReduceOptions options; + imReduceSetOptions(&options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f); + imMipmapCascade* mm_cascade = (imMipmapCascade*)m_mipmap_data; +#ifdef DEBUG + int ret = imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width, + m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4, + m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0); + assert(ret == 1); +#else + imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width, + m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4, + m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0); +#endif + for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++) + { + memcpy((uint8_t*)ptr + m_mipmap_sizes[i].second, + mm_cascade->mipmap[i + 1], + m_mipmap_sizes[i].first.getArea() * (m_single_channel ? 1 : 4)); +#ifdef DUMP_MIPMAP + if (m_single_channel) continue; + video::IImage* image = irr_driver->getVideoDriver() + ->createImageFromData(video::ECF_A8R8G8B8, m_mipmap_sizes[i].first, + mm_cascade->mipmap[i + 1], false/*ownForeignMemory*/); + irr_driver->getVideoDriver()->writeImageToFile(image, std::string + (StringUtils::toString(i) + "_" + + StringUtils::getBasename(NamedPath.getPtr()) + ".png").c_str()); + image->drop(); +#endif + } +} // threadedReload + +// ---------------------------------------------------------------------------- +void HQMipmapGenerator::threadedSubImage(void* ptr) const +{ +#if !(defined(SERVER_ONLY) || defined(USE_GLES2)) + glBindTexture(GL_TEXTURE_2D, m_texture_name); + for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++) + { + glTexSubImage2D(GL_TEXTURE_2D, i + 1, 0, 0, + m_mipmap_sizes[i].first.Width, m_mipmap_sizes[i].first.Height, + m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, + (uint8_t*)ptr + m_mipmap_sizes[i].second); + } + delete this; +#endif +} // threadedSubImage + +// ---------------------------------------------------------------------------- +void HQMipmapGenerator::cleanThreadedLoader() +{ + delete[] m_orig_data; +} // cleanThreadedLoader diff --git a/src/graphics/hq_mipmap_generator.hpp b/src/graphics/hq_mipmap_generator.hpp new file mode 100644 index 000000000..40827525d --- /dev/null +++ b/src/graphics/hq_mipmap_generator.hpp @@ -0,0 +1,98 @@ +// SuperTuxKart - a fun racing game with go-kart +// Copyright (C) 2017 SuperTuxKart-Team +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 3 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +#ifndef HEADER_HQ_MIPMAP_GENERATOR_HPP +#define HEADER_HQ_MIPMAP_GENERATOR_HPP + +#include "graphics/gl_headers.hpp" +#include "utils/no_copy.hpp" +#include "utils/types.hpp" + +#include +#include + +using namespace irr; + +class HQMipmapGenerator : public video::ITexture, NoCopy +{ +private: + uint8_t* m_orig_data; + + core::dimension2d m_size; + + GLuint m_texture_name; + + unsigned int m_texture_size; + + bool m_single_channel; + + void* m_mipmap_data; + + std::vector, size_t> > m_mipmap_sizes; + +public: + // ------------------------------------------------------------------------ + HQMipmapGenerator(const io::path& name, uint8_t* data, + const core::dimension2d& size, GLuint texture_name, + bool single_channel); + // ------------------------------------------------------------------------ + virtual ~HQMipmapGenerator(); + // ------------------------------------------------------------------------ + virtual void* lock(video::E_TEXTURE_LOCK_MODE mode = + video::ETLM_READ_WRITE, u32 mipmap_level = 0) + { return NULL; } + // ------------------------------------------------------------------------ + virtual void unlock() {} + // ------------------------------------------------------------------------ + virtual const core::dimension2d& getOriginalSize() const + { return m_size; } + // ------------------------------------------------------------------------ + virtual const core::dimension2d& getSize() const { return m_size; } + // ------------------------------------------------------------------------ + virtual video::E_DRIVER_TYPE getDriverType() const + { +#if defined(USE_GLES2) + return video::EDT_OGLES2; +#else + return video::EDT_OPENGL; +#endif + } + // ------------------------------------------------------------------------ + virtual video::ECOLOR_FORMAT getColorFormat() const + { return video::ECF_A8R8G8B8; } + // ------------------------------------------------------------------------ + virtual u32 getPitch() const { return 0; } + // ------------------------------------------------------------------------ + virtual bool hasMipMaps() const { return false; } + // ------------------------------------------------------------------------ + virtual void regenerateMipMapLevels(void* mipmap_data = NULL) {} + // ------------------------------------------------------------------------ + virtual u32 getOpenGLTextureName() const { return m_texture_name; } + // ------------------------------------------------------------------------ + virtual u64 getHandle() { return 0; } + // ------------------------------------------------------------------------ + virtual unsigned int getTextureSize() const { return m_texture_size; } + // ------------------------------------------------------------------------ + virtual void threadedReload(void* ptr, void* param) const; + // ------------------------------------------------------------------------ + virtual void threadedSubImage(void* ptr) const; + // ------------------------------------------------------------------------ + virtual void cleanThreadedLoader(); + +}; // HQMipmapGenerator + +#endif diff --git a/src/graphics/stk_tex_manager.cpp b/src/graphics/stk_tex_manager.cpp index e524c3b4b..02c4fef52 100644 --- a/src/graphics/stk_tex_manager.cpp +++ b/src/graphics/stk_tex_manager.cpp @@ -17,6 +17,7 @@ #include "graphics/stk_tex_manager.hpp" #include "config/hardware_stats.hpp" +#include "config/user_config.hpp" #include "graphics/central_settings.hpp" #include "graphics/materials.hpp" #include "graphics/threaded_tex_loader.hpp" @@ -33,11 +34,12 @@ STKTexManager::STKTexManager() : m_pbo(0), m_thread_size(0) #if !(defined(SERVER_ONLY) || defined(USE_GLES2)) if (CVS->supportsThreadedTextureLoading()) { + UserConfigParams::m_hq_mipmap = true; pthread_mutex_init(&m_threaded_load_textures_mutex, NULL); pthread_cond_init(&m_cond_request, NULL); m_thread_size = HardwareStats::getNumProcessors(); - m_thread_size = core::clamp(m_thread_size, 1, 3); - static const unsigned max_pbo_size = 48 * 1024 * 1024; + m_thread_size = core::clamp(m_thread_size, 1, 8); + static const unsigned max_pbo_size = 128 * 1024 * 1024; const unsigned each_capacity = max_pbo_size / m_thread_size; Log::info("STKTexManager", "%d thread(s) for texture loading," " each capacity %d MB", m_thread_size, diff --git a/src/graphics/stk_texture.cpp b/src/graphics/stk_texture.cpp index 0090db81a..2cfbe413c 100644 --- a/src/graphics/stk_texture.cpp +++ b/src/graphics/stk_texture.cpp @@ -18,14 +18,17 @@ #include "graphics/stk_texture.hpp" #include "config/user_config.hpp" #include "graphics/central_settings.hpp" +#include "graphics/hq_mipmap_generator.hpp" #include "graphics/irr_driver.hpp" #include "graphics/material.hpp" #include "graphics/material_manager.hpp" #include "graphics/materials.hpp" +#include "graphics/stk_tex_manager.hpp" #include "modes/profile_world.hpp" #include "utils/log.hpp" #include "utils/string_utils.hpp" +#include #include #include @@ -215,7 +218,7 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data, const unsigned int w = m_size.Width; const unsigned int h = m_size.Height; unsigned int format = m_single_channel ? GL_RED : GL_BGRA; - unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA; + unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA8; #if !defined(USE_GLES2) if (m_mesh_texture && CVS->isTextureCompressionEnabled()) @@ -227,13 +230,41 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data, else { internal_format = - m_single_channel ? GL_R8 : m_srgb ? GL_SRGB_ALPHA : GL_RGBA; + m_single_channel ? GL_R8 : m_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8; } #endif if (!useThreadedLoading()) formatConversion(data, &format, w, h); - if (!no_upload) + if (useThreadedLoading()) + { + if (m_texture_name == 0) + { + glGenTextures(1, &m_texture_name); + glBindTexture(GL_TEXTURE_2D, m_texture_name); + if (m_single_channel) + { + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ONE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_ONE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_ONE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_RED); + } + int levels = 1; + int width = w; + int height = h; + while (true) + { + width = width < 2 ? 1 : width >> 1; + height = height < 2 ? 1 : height >> 1; + levels++; + if (width == 1 && height == 1) + break; + } + glTexStorage2D(GL_TEXTURE_2D, levels, internal_format, w, h); + } + } + else if (!no_upload) { const bool reload = m_texture_name != 0; if (!reload) @@ -253,14 +284,14 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data, glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format, GL_UNSIGNED_BYTE, data); } - else if (!useThreadedLoading()) + else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format, GL_UNSIGNED_BYTE, data); } if (orig_img) orig_img->unlock(); - if (hasMipMaps() && !useThreadedLoading()) + if (hasMipMaps()) glGenerateMipmap(GL_TEXTURE_2D); } @@ -594,8 +625,15 @@ void STKTexture::threadedReload(void* ptr, void* param) const if (orig_img) { orig_img->unlock(); + orig_img->setDeleteMemory(false); orig_img->drop(); } + if (useHQMipmap()) + { + HQMipmapGenerator* hqmg = new HQMipmapGenerator(NamedPath, data, + m_size, m_texture_name, m_single_channel); + ((STKTexManager*)(param))->addThreadedLoadTexture(hqmg); + } else delete[] data; } // threadedReload @@ -607,8 +645,11 @@ void STKTexture::threadedSubImage(void* ptr) const glBindTexture(GL_TEXTURE_2D, m_texture_name); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height, m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr); + if (useHQMipmap()) + return; if (hasMipMaps()) glGenerateMipmap(GL_TEXTURE_2D); + #endif } // threadedSubImage @@ -620,3 +661,10 @@ void STKTexture::cleanThreadedLoader() m_file = NULL; m_img_loader = NULL; } // cleanThreadedLoader + +//----------------------------------------------------------------------------- +bool STKTexture::useHQMipmap() const +{ + return UserConfigParams::m_hq_mipmap && m_size.Width > 1 && + m_size.Height > 1; +} // useHQMipmap diff --git a/src/graphics/stk_texture.hpp b/src/graphics/stk_texture.hpp index 7d2c03fc6..652a41e8b 100644 --- a/src/graphics/stk_texture.hpp +++ b/src/graphics/stk_texture.hpp @@ -76,6 +76,8 @@ private: sc[i] = data[4 * i + 3]; return sc; } + // ------------------------------------------------------------------------ + bool useHQMipmap() const; public: // ------------------------------------------------------------------------ diff --git a/src/graphics/threaded_tex_loader.cpp b/src/graphics/threaded_tex_loader.cpp index 5d09d095d..ffefeb9af 100644 --- a/src/graphics/threaded_tex_loader.cpp +++ b/src/graphics/threaded_tex_loader.cpp @@ -84,8 +84,9 @@ void ThreadedTexLoader::handleCompletedTextures() size_t offset = m_pbo_offset; for (irr::video::ITexture* tex : m_completed_textures) { + size_t cur_offset = tex->getTextureSize(); tex->threadedSubImage((void*)offset); - offset += tex->getTextureSize(); + offset += cur_offset; } m_completed_textures.clear(); #endif