Try HQMipmapGenerator
commit 76aa38e5b4
parent 498ce3ebc9
@@ -116,6 +116,10 @@ if((WIN32 AND NOT MINGW) OR APPLE)
    set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/")
    set(JPEG_LIBRARY jpeglib)
endif()

add_subdirectory("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
include_directories("${PROJECT_SOURCE_DIR}/lib/graphics_utils")

# Build the irrlicht library
add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht")
include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include")
@@ -370,6 +374,7 @@ target_link_libraries(supertuxkart
    bulletmath
    enet
    stkirrlicht
    graphics_utils
    ${Angelscript_LIBRARIES}
    ${CURL_LIBRARIES}
    ${OGGVORBIS_LIBRARIES}

lib/graphics_utils/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 2.6)
if (UNIX OR MINGW)
    add_definitions(-O3 -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -ffast-math)
endif()
add_library(graphics_utils STATIC
    mipmap/cpusimd.c
    mipmap/img.c
    mipmap/imgresize.c
)

lib/graphics_utils/mipmap/cpusimd.c (new file, 568 lines)
@@ -0,0 +1,568 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2008-2016 Alexis Naveros.
 *
 *
 * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
 * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
 *
 *
 * Some functions are Copyright (C) 2008 José Fonseca
 * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps()
 *
 *
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>

#include <sys/time.h>


#include "cpusimd.h"


////


#if CPU_SSE_SUPPORT

const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 };
const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 };
const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 };
const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 };
const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 };
const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1, ~1, ~1, ~1 };
const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 };
const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 };
const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 };
const float simd4fPi[4] CPU_ALIGN16 = { M_PI, M_PI, M_PI, M_PI };
const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 };
const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff };
const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f };
const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f };

#endif


////


#if CPU_SSE2_SUPPORT


/* Copyright (C) 2007 Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
static const float simd4f_sincof_p0[4] CPU_ALIGN16 = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
static const float simd4f_sincof_p1[4] CPU_ALIGN16 = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
static const float simd4f_sincof_p2[4] CPU_ALIGN16 = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
static const float simd4f_coscof_p0[4] CPU_ALIGN16 = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
static const float simd4f_coscof_p1[4] CPU_ALIGN16 = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
static const float simd4f_coscof_p2[4] CPU_ALIGN16 = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };

__m128 simd4f_sin_ps( __m128 x )
{
  __m128 xmm1, xmm2, xmm3, sign_bit, y;
  __m128i emm0, emm2;

  xmm2 = _mm_setzero_ps();

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv );
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2
     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128 *)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128 *)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128 *)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m128 *)simd4f_coscof_p0;
  __m128 z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128 *)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128 *)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128 *)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
__m128 simd4f_cos_ps( __m128 x )
{
  __m128 xmm1, xmm2, xmm3, y;
  __m128i emm0, emm2;

  xmm2 = _mm_setzero_ps();

  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(__m128i*)simd4uTwo);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  __m128 sign_bit = _mm_castsi128_ps(emm0);
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m128*)simd4f_coscof_p0;
  __m128 z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128*)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c )
{
  __m128 xmm1, xmm2, xmm3, sign_bit_sin, y;
  __m128i emm0, emm2, emm4;

  xmm3 = _mm_setzero_ps();

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)simd4fSignMask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  emm4 = _mm_sub_epi32(emm4, *(__m128i*)simd4uTwo);
  emm4 = _mm_andnot_si128(emm4, *(__m128i*)simd4uFour);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  __m128 z = _mm_mul_ps(x,x);
  y = *(__m128*)simd4f_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128*)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif


////


#if CPU_SSE2_SUPPORT


/* Copyright (C) 2008 José Fonseca
   http://jrfonseca.blogspot.ca/2008/09/fast-sse2-pow-tables-or-polynomials.html
   MIT license

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in all
   copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#define POLY0(x,c0) _mm_set1_ps(c0)
#define POLY1(x,c0,c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x,c0,c1,c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x,c0,c1,c2,c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x,c0,c1,c2,c3,c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x,c0,c1,c2,c3,c4,c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

__m128 simd4f_exp2_ps( __m128 x )
{
  __m128i ipart;
  __m128 fpart, expipart, expfpart;

  x = _mm_min_ps( x, _mm_set1_ps( 129.00000f ) );
  x = _mm_max_ps( x, _mm_set1_ps( -126.99999f ) );
  /* ipart = int(x - 0.5) */
  ipart = _mm_cvtps_epi32( _mm_sub_ps( x, _mm_set1_ps( 0.5f ) ) );
  /* fpart = x - ipart */
  fpart = _mm_sub_ps( x, _mm_cvtepi32_ps( ipart ) );
  /* expipart = (float) (1 << ipart) */
  expipart = _mm_castsi128_ps( _mm_slli_epi32( _mm_add_epi32( ipart, _mm_set1_epi32( 127 ) ), 23 ) );
  /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
  expfpart = POLY5( fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f );
#elif EXP_POLY_DEGREE == 4
  expfpart = POLY4( fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f );
#elif EXP_POLY_DEGREE == 3
  expfpart = POLY3( fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f );
#elif EXP_POLY_DEGREE == 2
  expfpart = POLY2( fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f );
#else
#error
#endif
  return _mm_mul_ps(expipart, expfpart);
}

__m128 simd4f_log2_ps( __m128 x )
{
  __m128i expmask, mantmask, i;
  __m128 one, vexp, mant, logmant;

  expmask = _mm_set1_epi32( 0x7f800000 );
  mantmask = _mm_set1_epi32( 0x007fffff );
  one = _mm_set1_ps( 1.0f );
  i = _mm_castps_si128( x );
  /* exp = (float) exponent(x) */
  vexp = _mm_cvtepi32_ps( _mm_sub_epi32( _mm_srli_epi32( _mm_and_si128( i, expmask ), 23 ), _mm_set1_epi32( 127 ) ) );
  /* mant = (float) mantissa(x) */
  mant = _mm_or_ps( _mm_castsi128_ps( _mm_and_si128( i, mantmask ) ), one );
  /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
   * These coefficients can be generate with
   * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   */
#if LOG_POLY_DEGREE == 6
  logmant = POLY5( mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f );
#elif LOG_POLY_DEGREE == 5
  logmant = POLY4( mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f );
#elif LOG_POLY_DEGREE == 4
  logmant = POLY3( mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f );
#elif LOG_POLY_DEGREE == 3
  logmant = POLY2( mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f );
#else
#error
#endif
  /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
  logmant = _mm_mul_ps( logmant, _mm_sub_ps(mant, one ) );
  return _mm_add_ps( logmant, vexp );
}


__m128 simd4f_pow_ps( __m128 x, __m128 y )
{
  return simd4f_exp2_ps( _mm_mul_ps( simd4f_log2_ps( x ), y ) );
}


#endif

////


#if CPU_SSE2_SUPPORT


/*
  By Potatoswatter
  http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent
*/

#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif

static inline CC_ALWAYSINLINE __m128 simd4f_fastpow_ps( __m128 arg, uint32_t expnum, uint32_t expden, uint32_t coeffnum, uint32_t coeffden )
{
  __m128 ret = arg;
  float corrfactor, powfactor;
  /* Apply a constant pre-correction factor. */
  corrfactor = exp2( 127.0 * expden / expnum - 127.0 ) * pow( 1.0 * coeffnum / coeffden, 1.0 * expden / expnum );
  powfactor = 1.0 * expnum / expden;
  ret = _mm_mul_ps( ret, _mm_set1_ps( corrfactor ) );
  /* Reinterpret arg as integer to obtain logarithm. */
  ret = _mm_cvtepi32_ps( _mm_castps_si128( ret ) );
  /* Multiply logarithm by power. */
  ret = _mm_mul_ps( ret, _mm_set1_ps( powfactor ) );
  /* Convert back to "integer" to exponentiate. */
  ret = _mm_castsi128_ps( _mm_cvtps_epi32( ret ) );
  return ret;
}

__m128 simd4f_pow12d5_ps( __m128 arg )
{
  /* Lower exponents provide lower initial error, but too low causes overflow. */
  __m128 xf = simd4f_fastpow_ps( arg, 4, 5, (int)( 1.38316186f * 1e9 ), (int)1e9 );
  /* Imprecise 4-cycle sqrt is still far better than fastpow, good enough. */
  __m128 xfm4 = _mm_rsqrt_ps( xf );
  __m128 xf4 = _mm_mul_ps( xf, xfm4 );
  /* Precisely calculate x^2 and x^3 */
  __m128 x2 = _mm_mul_ps( arg, arg );
  __m128 x3 = _mm_mul_ps( x2, arg );
  /* Overestimate of x^2 * x^0.4 */
  x2 = _mm_mul_ps( x2, xf4 );
  /* Get x^-0.2 from x^0.4, and square it for x^-0.4. Combine into x^-0.6. */
  __m128 xfm2 = _mm_rsqrt_ps( xf4 );
  x3 = _mm_mul_ps( x3, xfm4 );
  x3 = _mm_mul_ps( x3, xfm2 );
  return _mm_mul_ps( _mm_add_ps( x2, x3 ), _mm_set1_ps( 1.0f/1.960131704207789f * 0.9999f ) );
}

__m128 simd4f_pow5d12_ps( __m128 arg )
{
  /* 5/12 is too small, so compute the 4th root of 20/12 instead. */
  /* 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. */
  /* weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 */
  __m128 xf = simd4f_fastpow_ps( arg, 2, 3, (int)( 0.629960524947437f * 1e9 ), (int)1e9 );
  __m128 xover = _mm_mul_ps( arg, xf );
  __m128 xfm1 = _mm_rsqrt_ps( xf );
  __m128 x2 = _mm_mul_ps( arg, arg );
  __m128 xunder = _mm_mul_ps( x2, xfm1 );
  /* sqrt2 * over + 2 * sqrt2 * under */
  __m128 xavg = _mm_mul_ps( _mm_set1_ps( 1.0f/( 3.0f * 0.629960524947437f ) * 0.999852f ), _mm_add_ps( xover, xunder ) );
  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
  return xavg;
}

#endif


////

lib/graphics_utils/mipmap/cpusimd.h (new file, 410 lines)
@@ -0,0 +1,410 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2008-2016 Alexis Naveros.
 *
 * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
 * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
 *
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */


#ifndef CPUSIMD_H
#define CPUSIMD_H


////


#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
#if __SSE__ || _M_X64 || _M_IX86_FP >= 1 || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2 || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#if __SSE4A__ || CPU_ENABLE_SSE4A
#include <ammintrin.h>
#define CPU_SSE4A_SUPPORT (1)
#endif
#if __AVX__ || CPU_ENABLE_AVX
#include <immintrin.h>
#define CPU_AVX_SUPPORT (1)
#endif
#if __AVX2__ || CPU_ENABLE_AVX2
#include <immintrin.h>
#define CPU_AVX2_SUPPORT (1)
#endif
#if __XOP__ || CPU_ENABLE_XOP
#include <immintrin.h>
#define CPU_XOP_SUPPORT (1)
#endif
#if __FMA3__ || CPU_ENABLE_FMA3
#include <immintrin.h>
#define CPU_FMA3_SUPPORT (1)
#endif
#if __FMA4__ || CPU_ENABLE_FMA4
#include <immintrin.h>
#define CPU_FMA4_SUPPORT (1)
#endif
#if __RDRND__ || CPU_ENABLE_RDRND
#include <immintrin.h>
#define CPU_RDRND_SUPPORT (1)
#endif
#if __POPCNT__ || CPU_ENABLE_POPCNT
#include <popcntintrin.h>
#define CPU_POPCNT_SUPPORT (1)
#endif
#if __LZCNT__ || CPU_ENABLE_LZCNT
#include <lzcntintrin.h>
#define CPU_LZCNT_SUPPORT (1)
#endif
#if __F16C__ || CPU_ENABLE_F16C
#include <f16cintrin.h>
#define CPU_F16C_SUPPORT (1)
#endif
#if __BMI__ || CPU_ENABLE_BMI
#include <bmiintrin.h>
#define CPU_BMI_SUPPORT (1)
#endif
#if __BMI2__ || CPU_ENABLE_BMI2
#include <bmi2intrin.h>
#define CPU_BMI2_SUPPORT (1)
#endif
#if __TBM__ || CPU_ENABLE_TBM
#include <tbmintrin.h>
#define CPU_TBM_SUPPORT (1)
#endif


#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CPU_ALIGN16 __attribute__((aligned(16)))
#define CPU_ALIGN32 __attribute__((aligned(32)))
#define CPU_ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
#define CPU_ALIGN16 __declspec(align(16))
#define CPU_ALIGN64 __declspec(align(64))
#else
#define CPU_ALIGN16
#define CPU_ALIGN32
#define CPU_ALIGN64
#warning "SSE/AVX Disabled: Unsupported Compiler."
#undef CPU_SSE_SUPPORT
#undef CPU_SSE2_SUPPORT
#undef CPU_SSE3_SUPPORT
#undef CPU_SSSE3_SUPPORT
#undef CPU_SSE4_1_SUPPORT
#undef CPU_SSE4_2_SUPPORT
#undef CPU_AVX_SUPPORT
#undef CPU_AVX2_SUPPORT
#undef CPU_XOP_SUPPORT
#undef CPU_FMA3_SUPPORT
#undef CPU_FMA4_SUPPORT
#endif


////


#if CPU_SSE_SUPPORT
#define CPU_APPROX_DIV_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rcp_ss(_mm_set_ss(w))))
#define CPU_APPROX_SQRT_FLOAT(z) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(z))))
#define CPU_APPROX_RSQRT_FLOAT(z) _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(z)))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(w))))
#else
#define CPU_APPROX_DIV_FLOAT(z,w) ((z)/(w))
#define CPU_APPROX_SQRT_FLOAT(z) (sqrtf(z))
#define CPU_APPROX_RSQRT_FLOAT(z) (1.0/sqrtf(z))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) ((z)/sqrtf(w))
#endif


#if CPU_SSE3_SUPPORT
#define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy)
#define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy)
#elif CPU_SSE_SUPPORT
static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy )
{
  __m128 vh, vl;
  vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) );
  vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) );
  return _mm_add_ps( vh, vl );
}
#define CPU_HADD_PD(vx,vy) _mm_add_sd(vx,_mm_unpackhi_pd(vy,vy))
#endif


#if CPU_SSE4_1_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_cvtepu8_epi32(x)
#define CPU_CVT_S8_TO_I32(x,vzero) _mm_cvtepi8_epi32(x)
#elif CPU_SSE2_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((x),(vzero)),(vzero))
static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero )
{
  __m128i vsign;
  vsign = _mm_cmpgt_epi8( vzero, vx );
  return _mm_unpacklo_epi16( _mm_unpacklo_epi8( vx, vsign ), _mm_unpacklo_epi8( vsign, vsign ) );
}
#endif


#if CPU_SSE4_1_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask)
#define CPU_BLENDV_PD(x,y,mask) _mm_blendv_pd(x,y,mask)
#elif CPU_SSE2_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask))
#define CPU_BLENDV_PD(x,y,mask) _mm_or_pd(_mm_andnot_pd(mask,x),_mm_and_pd(y,mask))
#endif


/*
  CPU_FMADD = ((f0*f1)+t0)
  CPU_FMSUB = ((f0*f1)-t0)
*/
#if CPU_FMA3_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0)
#elif CPU_FMA4_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0)
#else
#define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0)
#endif

////


#if CPU_SSE_SUPPORT

extern const uint32_t simd4fSignMask[4];
extern const uint32_t simd4fSignMaskInv[4];
extern const float simd4fHalf[4];
extern const float simd4fOne[4];
extern const float simd4fTwo[4];
extern const float simd4fThree[4];
extern const uint32_t simd4uOne[4];
extern const uint32_t simd4uOneInv[4];
extern const uint32_t simd4uTwo[4];
extern const uint32_t simd4uFour[4];
extern const float simd4fQuarter[4];
extern const float simd4fPi[4];
extern const float simd4fZeroOneTwoThree[4];
extern const uint32_t simd4fAlphaMask[4];
extern const float simd4f255[4];
extern const float simd4f255Inv[4];

#endif


#if CPU_SSE2_SUPPORT

/* Input range between -8192 and 8192 */
__m128 simd4f_sin_ps( __m128 x );
__m128 simd4f_cos_ps( __m128 x );
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c );

#endif

#if CPU_SSE2_SUPPORT

__m128 simd4f_exp2_ps( __m128 x );
__m128 simd4f_log2_ps( __m128 x );
__m128 simd4f_pow_ps( __m128 x, __m128 y );

#endif

#if CPU_SSE2_SUPPORT

__m128 simd4f_pow12d5_ps( __m128 arg );
__m128 simd4f_pow5d12_ps( __m128 arg );

#endif


////


#if CPU_SSE2_SUPPORT

#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif

static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx )
{
  __m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
  vx2 = _mm_mul_ps( vx, vx );
  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) );
  vpwsqrtinv = _mm_rsqrt_ps( vpow );
  vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
  return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) );
}

static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
{
  __m128 vpow;
  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) );
  vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) );
#if 0
  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
#else
  vx = _mm_sqrt_ps( vx );
  vx = _mm_sqrt_ps( vx );
#endif
  return vx;
}

#endif


////


#if CPU_SSE_SUPPORT

static inline void simdPrintDebugSSE4f( char *str, __m128 v )
{
  float CPU_ALIGN16 store[4];
  _mm_store_ps( (void *)store, v );
  printf( "%s %f %f %f %f\n", str, store[0], store[1], store[2], store[3] );
  return;
}

static inline void simdPrintDebugSSE2d( char *str, __m128d v )
{
  double CPU_ALIGN16 store[2];
  _mm_store_pd( (void *)store, v );
  printf( "%s %f %f\n", str, store[0], store[1] );
  return;
}

static inline void simdPrintDebugSSE16u8( char *str, __m128i v )
{
  uint8_t CPU_ALIGN16 store[16];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] );
  return;
}

static inline void simdPrintDebugSSE8u16( char *str, __m128i v )
{
  uint16_t CPU_ALIGN16 store[8];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] );
  return;
}

static inline void simdPrintDebugSSE4u32( char *str, __m128i v )
{
  uint32_t CPU_ALIGN16 store[4];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] );
  return;
}

static inline void simdPrintDebugSSE2u64( char *str, __m128i v )
{
  uint64_t CPU_ALIGN16 store[2];
  _mm_store_si128( (void *)store, v );
  printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] );
  return;
}

#endif


////


#endif

lib/graphics_utils/mipmap/img.c (new file, 628 lines; listing truncated below)
@@ -0,0 +1,628 @@
/* *****************************************************************************
 *
 * Copyright (c) 2007-2016 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this file; see the file named COPYING for more
 * information.
 *
 * *****************************************************************************
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>


#include "cpusimd.h"

#include "img.h"


#ifndef ADDRESS
#define ADDRESS(p,o) ((void *)(((char *)p)+(o)))
#endif


////


void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey )
{
  int y;
  void *dst, *src;
  src = ADDRESS( image->data, ( srcx * image->format.bytesperpixel ) + ( srcy * image->format.bytesperline ) );
  dst = ADDRESS( image->data, ( dstx * image->format.bytesperpixel ) + ( dsty * image->format.bytesperline ) );
  for( y = 0 ; y < sizey ; y++ )
  {
    memcpy( dst, src, sizex * image->format.bytesperpixel );
    src = ADDRESS( src, image->format.bytesperline );
    dst = ADDRESS( dst, image->format.bytesperline );
  }
  return;
}


#if CPU_SSE2_SUPPORT
static const uint16_t CPU_ALIGN16 imgBlendRgbMask[8] = { 0xffff, 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000 };
static const uint8_t CPU_ALIGN16 imgBlendAlphaTestMask[16] = { 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff };
static const uint16_t CPU_ALIGN16 imgBlendRoundBias[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };
#if CPU_SSSE3_SUPPORT
static const uint8_t CPU_ALIGN16 imgBlendShufMask[16] = { 6,7,6,7,6,7,6,7, 14,15,14,15,14,15,14,15 };
#endif
#endif

static void imgBlendImageRgba2Rgba( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
#if CPU_SSE2_SUPPORT
  int row4size;
  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
  __m128i vzero, v255, vrgbmask, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
  __m128i vshufmask;
#endif
#else
  int32_t dstr, dstg, dstb, dsta;
  int32_t srcr, srcg, srcb, srca;
#endif
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

#if CPU_SSE2_SUPPORT
  row4size = srcimage->format.width & ~3;
  vzero = _mm_setzero_si128();
  v255 = _mm_set1_epi16( 255 );
  vrgbmask = _mm_load_si128( (void *)imgBlendRgbMask );
  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;

#if CPU_SSE2_SUPPORT
    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
    {
      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
      vsrc23 = _mm_loadu_si128( (void *)srcrow );
      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
        continue;
      vdst23 = _mm_loadu_si128( (void *)dstrow );
      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, _mm_and_si128( vblend23, vrgbmask ) ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
      /* Combine interleaved and store */
      _mm_storeu_si128( (void *)dstrow, _mm_packus_epi16( vdst01, vdst23 ) );
    }
    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_packus_epi16( vdst01, vdst01 ) ) );
    }
#else
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dsta = (int32_t)dstrow[3];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dsta = ( ( dsta << 8 ) - dsta + ( srca * srca ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dsta = ( dsta + ( dsta >> 8 ) ) >> 8;
      if( dsta > 255 )
        dsta = 255;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
      dstrow[3] = (unsigned char)dsta;
    }
#endif
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

static void imgBlendImageRgba2Rgbx( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
#if CPU_SSE2_SUPPORT
  int row4size;
  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
  __m128i vzero, v255, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
  __m128i vshufmask;
#endif
#else
  int32_t dstr, dstg, dstb;
  int32_t srcr, srcg, srcb, srca;
#endif
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

#if CPU_SSE2_SUPPORT
  row4size = srcimage->format.width & ~3;
  vzero = _mm_setzero_si128();
  v255 = _mm_set1_epi16( 255 );
  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;

#if CPU_SSE2_SUPPORT
    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
    {
      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
      vsrc23 = _mm_loadu_si128( (void *)srcrow );
      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
        continue;
      vdst23 = _mm_loadu_si128( (void *)dstrow );
      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, vblend23 ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
      /* Combine interleaved and store */
      _mm_storeu_si128( (void *)dstrow, _mm_or_si128( _mm_packus_epi16( vdst01, vdst23 ), valphatest ) );
    }
    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_or_si128( _mm_packus_epi16( vdst01, vdst01 ), valphatest ) ) );
    }
#else
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
      dstrow[3] = (unsigned char)255;
    }
#endif
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

static void imgBlendImageRgba2Rgb( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
  int32_t dstr, dstg, dstb;
  int32_t srcr, srcg, srcb, srca;
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 3 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 3 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
|
||||
{
|
||||
void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
|
||||
blendfunc = 0;
|
||||
if( srcimage->format.bytesperpixel == 4 )
|
||||
{
|
||||
if( dstimage->format.bytesperpixel == 4 )
|
||||
{
|
||||
if( ( dstimage->format.type == IMG_FORMAT_TYPE_RGBA32 ) || ( dstimage->format.type == IMG_FORMAT_TYPE_BGRA32 ) )
|
||||
blendfunc = imgBlendImageRgba2Rgba;
|
||||
else
|
||||
blendfunc = imgBlendImageRgba2Rgbx;
|
||||
}
|
||||
else if( dstimage->format.bytesperpixel == 3 )
|
||||
blendfunc = imgBlendImageRgba2Rgb;
|
||||
}
|
||||
return blendfunc;
|
||||
}
|
||||
|
||||
|
||||
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
|
||||
{
|
||||
void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
|
||||
blendfunc = imgBlendGetFunction( dstimage, srcimage );
|
||||
if( blendfunc )
|
||||
{
|
||||
blendfunc( dstimage, dstx, dsty, srcimage );
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
////
|
||||
|
||||
|
||||
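For reference, a hypothetical caller of the dispatcher above (not part of the patch): it blends a 4-bytes-per-pixel RGBA sprite onto a 3-bytes-per-pixel RGB canvas, which resolves to imgBlendImageRgba2Rgb().

/* Usage sketch, assuming both imgImage structures are already allocated and
 * filled in; imgBlendImage() returns 0 when no blend path exists for the
 * source/destination format pair. */
#include <stdio.h>
#include "img.h"

static void blendSpriteExample( imgImage *canvasrgb24, imgImage *spritergba32 )
{
  if( !( imgBlendImage( canvasrgb24, 16, 16, spritergba32 ) ) )
    printf( "imgBlendImage: unsupported format pair\n" );
  return;
}
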
void imgAllocCopy( imgImage *dstimage, imgImage *srcimage )
{
  dstimage->format = srcimage->format;
  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );
  memcpy( dstimage->data, srcimage->data, srcimage->format.height * srcimage->format.bytesperline );
  return;
}


void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize )
{
  int y;
  void *dst, *src, *dstrow;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = srcimage->format.type;
  dstimage->format.bytesperpixel = srcimage->format.bytesperpixel;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = srcimage->data;
  dst = dstimage->data;
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    dstrow = dst;
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
    memcpy( dstrow, src, srcimage->format.width * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocExtractChannel( imgImage *dstimage, imgImage *srcimage, int channelindex )
{
  int x, y;
  unsigned char *dst, *src, *srcrow;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcrow = src;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dst[x] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize )
{
  int x, y;
  unsigned char *src, *dst, *srcrow, *dstrow;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = dst;
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
    for( x = 0 ; x < srcimage->format.width ; x++ )
    {
      dstrow[x] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
    }
    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b )
{
  int x, y;
  unsigned char *dst, *src, *dstrow, *srcrow;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_RGBA32;
  dstimage->format.bytesperpixel = 4;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = dst;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dstrow[0] = r;
      dstrow[1] = g;
      dstrow[2] = b;
      dstrow[3] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
      dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast )
{
  int x, y;
  float r, g, b;
  unsigned char *dst, *src, *dstrow, *srcrow;

  dstimage->format = srcimage->format;
  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );

  brightness += 0.5f;

  if( dstimage->format.bytesperpixel >= 3 )
  {
    src = srcimage->data;
    dst = dstimage->data;
    for( y = 0 ; y < dstimage->format.height ; y++ )
    {
      srcrow = src;
      dstrow = dst;
      for( x = 0 ; x < dstimage->format.width ; x++ )
      {
        r = (1.0f/255.0f) * (float)srcrow[0];
        g = (1.0f/255.0f) * (float)srcrow[1];
        b = (1.0f/255.0f) * (float)srcrow[2];
        r = ( ( r - 0.5f ) * contrast ) + brightness;
        g = ( ( g - 0.5f ) * contrast ) + brightness;
        b = ( ( b - 0.5f ) * contrast ) + brightness;
        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
        dstrow[1] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( g * 255.0f ) ) );
        dstrow[2] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( b * 255.0f ) ) );
        if( dstimage->format.bytesperpixel >= 4 )
          dstrow[3] = srcrow[3];
        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
      }
      src = ADDRESS( src, srcimage->format.bytesperline );
      dst = ADDRESS( dst, dstimage->format.bytesperline );
    }
  }
  else if( dstimage->format.bytesperpixel == 1 )
  {
    src = srcimage->data;
    dst = dstimage->data;
    for( y = 0 ; y < dstimage->format.height ; y++ )
    {
      srcrow = src;
      dstrow = dst;
      for( x = 0 ; x < dstimage->format.width ; x++ )
      {
        r = (1.0f/255.0f) * (float)srcrow[0];
        r = ( ( r - 0.5f ) * contrast ) + brightness;
        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
      }
      src = ADDRESS( src, srcimage->format.bytesperline );
      dst = ADDRESS( dst, dstimage->format.bytesperline );
    }
  }

  return;
}

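Because the function adds 0.5f to brightness up front, the per-channel mapping in normalized [0,1] space is out = ((in - 0.5) * contrast) + 0.5 + brightness, followed by rescaling, rounding and clamping. A standalone spot check (not part of the patch): with brightness = 0 and contrast = 1 the mapping should reproduce every 8-bit input after rounding.

/* Spot check of the brightness/contrast mapping used above. */
#include <assert.h>
#include <math.h>

int main( void )
{
  int i, out;
  float v;
  for( i = 0 ; i <= 255 ; i++ )
  {
    v = (1.0f/255.0f) * (float)i;
    v = ( ( v - 0.5f ) * 1.0f ) + ( 0.5f + 0.0f );
    out = (int)fmaxf( 0.0f, fminf( 255.0f, roundf( v * 255.0f ) ) );
    assert( out == i );
  }
  return 0;
}
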
void imgFree( imgImage *image )
{
  free( image->data );
  image->data = 0;
  return;
}


////

74 lib/graphics_utils/mipmap/img.h Normal file
@ -0,0 +1,74 @@
/* *****************************************************************************
 *
 * Copyright (c) 2007-2016 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this file; see the file named COPYING for more
 * information.
 *
 * *****************************************************************************
 */

#ifndef IMG_H
#define IMG_H


typedef struct
{
  int width;
  int height;
  int type;
  int bytesperpixel;
  int bytesperline;
} imgFormat;

enum
{
  IMG_FORMAT_TYPE_ANY,
  IMG_FORMAT_TYPE_RGB24,
  IMG_FORMAT_TYPE_BGR24,
  IMG_FORMAT_TYPE_RGBX32,
  IMG_FORMAT_TYPE_BGRX32,
  IMG_FORMAT_TYPE_RGBA32,
  IMG_FORMAT_TYPE_BGRA32,
  IMG_FORMAT_TYPE_GRAYSCALE,
  IMG_FORMAT_TYPE_GRAYALPHA
};

typedef struct
{
  imgFormat format;
  void *data;
} imgImage;


////


void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey );

void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );

void imgAllocCopy( imgImage *dst, imgImage *src );
void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize );
void imgAllocExtractChannel( imgImage *dst, imgImage *src, int channelindex );
void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize );
void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b );
void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast );

void imgFree( imgImage *image );


#endif

4098 lib/graphics_utils/mipmap/imgresize.c Normal file
File diff suppressed because it is too large

150 lib/graphics_utils/mipmap/imgresize.h Normal file
@ -0,0 +1,150 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2014-2017 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */


#ifndef IMGRESIZE_H
#define IMGRESIZE_H


typedef struct
{
  /* Specify filter type, from the IM_REDUCE_FILTER_* list */
  int filter;
  /* High quality, a little slow: hopcount=3; */
  /* Good quality, much faster: hopcount=2; */
  int hopcount;
  /* Strong preservation/amplification of details: alpha=2.0f; */
  /* Mild preservation/amplification of details: alpha=6.0f; */
  float alpha;
  /* NORMALMAP filters: factor to amplify normals on X and Y before normalization */
  float amplifynormal;
  /* NORMALMAP_SUSTAIN filters: Preserve a factor of deviation "energy" as calculated by sqrtf(x*x+y*y) */
  float normalsustainfactor;
} imReduceOptions;

static inline void imReduceSetOptions( imReduceOptions *options, int filter, int hopcount, float alpha, float amplifynormal, float normalsustainfactor )
{
  options->filter = filter;
  options->hopcount = hopcount;
  options->alpha = alpha;
  options->amplifynormal = amplifynormal;
  options->normalsustainfactor = normalsustainfactor;
  return;
}


/* Reduce the image's dimensions by an integer divisor ~ this is fairly fast */
int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options );
/* Same as imReduceImageKaiserDataDivisor(), but imgdst is allocated */
int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options );


/* Reduce the image's dimensions to match the newwidth and newheight ~ this is a little slower */
int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options );
/* Same as imReduceImageKaiserData(), but imgdst is allocated */
int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options );


/* Resize by half with a dumb box filter ~ don't use that except for the smallest mipmaps */
/* Filters with ALPHANORM and/or SUSTAIN keywords are processed as the regular base filter only */
int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options );
int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options );


/*
Keywords for image reduction filters

LINEAR: Data is linear, note that this is *not* the format of typical diffuse textures
SRGB: Color is in sRGB space, any alpha is presumed linear
NORMALMAP: RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear

ALPHANORM: Alpha normalization, the weight of pixels is proportional to their alpha values
  (do you have "black" fully transparent pixels? please use an ALPHANORM filter)
SUSTAIN: The "energy" of the normal map is sustained, amplified to preserve the level of details
  Note that this filter is rather slow (set options->normalsustainfactor to 0.75 or so)
*/

enum
{
  /* Linear space */
  IM_REDUCE_FILTER_LINEAR,
  IM_REDUCE_FILTER_LINEAR_ALPHANORM,

  /* sRGB space (probably what you want for diffuse textures) */
  IM_REDUCE_FILTER_SRGB,
  IM_REDUCE_FILTER_SRGB_ALPHANORM,

  /* RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear */
  IM_REDUCE_FILTER_NORMALMAP,
  IM_REDUCE_FILTER_NORMALMAP_ALPHANORM,
  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN,
  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM,

  /* Custom specialized filters */
  IM_REDUCE_FILTER_WATERMAP,
  IM_REDUCE_FILTER_PLANTMAP,
  IM_REDUCE_FILTER_FOLLIAGE,
  IM_REDUCE_FILTER_SKY,
  IM_REDUCE_FILTER_FOG
};

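The comments above already suggest concrete settings; a hedged usage sketch (not from the patch) tying them together: halve an sRGB RGBA texture with the Kaiser filter, using the "high quality" hopcount and alpha values quoted in the comments (the same values the new HQMipmapGenerator passes later in this commit). imgresize.h relies on imgImage, so img.h is included first.

/* Usage sketch: imReduceImageKaiserDivisor() allocates the destination image;
 * a zero return is treated as failure, matching the assert in
 * HQMipmapGenerator::threadedReload(). */
#include <stdio.h>
#include "img.h"
#include "imgresize.h"

static int halveTextureExample( imgImage *halfsize, imgImage *fullsize )
{
  imReduceOptions options;
  imReduceSetOptions( &options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f );
  if( !( imReduceImageKaiserDivisor( halfsize, fullsize, 2, &options ) ) )
  {
    printf( "imReduceImageKaiserDivisor() failed\n" );
    return 0;
  }
  return 1;
}
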
////


#define IM_MIPMAP_CASCADE_MAX (16)

typedef struct
{
  int width;
  int height;
  int layercount;
  int bytesperpixel;
  int bytesperline;
  imReduceOptions *options;
  void *mipmap[IM_MIPMAP_CASCADE_MAX];
} imMipmapCascade;


int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags );

void imFreeMipmapCascade( imMipmapCascade *cascade );

/* For base texture, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_BASE (0x1)
/* For generated mipmaps, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS (0x2)

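A hedged sketch of the cascade API (not from the patch), mirroring the call sequence HQMipmapGenerator::threadedReload() uses later in this commit: build the chain from the level-0 pixels, read the reduced levels, then free. The assumption that mipmap[0] is the base level and mipmap[1] the first reduction is inferred from that code, which uploads mipmap[i + 1].

/* Assumptions: rgba points at width*height*4 bytes of sRGB pixels with no
 * row padding (bytesperline = width * 4), single layer. */
#include <stdio.h>
#include "img.h"
#include "imgresize.h"

static void buildCascadeExample( unsigned char *rgba, int width, int height )
{
  imReduceOptions options;
  imMipmapCascade cascade;
  imReduceSetOptions( &options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f );
  if( !( imBuildMipmapCascade( &cascade, rgba, width, height, 1, 4, width * 4, &options, 0 ) ) )
    return;
  /* cascade.mipmap[1] is the first reduced level (half width, half height) */
  printf( "first reduced level at %p\n", cascade.mipmap[1] );
  imFreeMipmapCascade( &cascade );
  return;
}
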
////


void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline );


////


#endif

@ -934,6 +934,11 @@ namespace UserConfigParams
    PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked",
                   "Enable all karts and tracks") );

    PARAM_PREFIX BoolUserConfigParam m_hq_mipmap
        PARAM_DEFAULT( BoolUserConfigParam(false, "hq_mipmap",
                       "Generate mipmap for textures using "
                       "high quality method with SSE") );

    // TODO? implement blacklist for new irrlicht device and GUI
    PARAM_PREFIX std::vector<std::string> m_blacklist_res;

@ -490,7 +490,7 @@ bool CentralVideoSettings::isARBPixelBufferObjectUsable() const

bool CentralVideoSettings::supportsThreadedTextureLoading() const
{
    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable();
    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable() && isARBTextureStorageUsable();
}

#endif   // !SERVER_ONLY

119 src/graphics/hq_mipmap_generator.cpp Normal file
@ -0,0 +1,119 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#include "graphics/hq_mipmap_generator.hpp"
#define DUMP_MIPMAP
#ifdef DUMP_MIPMAP
#include "graphics/irr_driver.hpp"
#include "utils/string_utils.hpp"
#endif
#include <cassert>

extern "C"
{
#include <mipmap/img.h>
#include <mipmap/imgresize.h>
}

// ----------------------------------------------------------------------------
HQMipmapGenerator::HQMipmapGenerator(const io::path& name, uint8_t* data,
                                     const core::dimension2d<u32>& size,
                                     GLuint texture_name, bool single_channel)
                 : video::ITexture(name), m_orig_data(data), m_size(size),
                   m_texture_name(texture_name), m_texture_size(0),
                   m_single_channel(single_channel), m_mipmap_data(NULL)
{
    unsigned width = m_size.Width;
    unsigned height = m_size.Height;
    while (true)
    {
        width = width < 2 ? 1 : width >> 1;
        height = height < 2 ? 1 : height >> 1;
        m_mipmap_sizes.emplace_back(core::dimension2du(width, height),
                                    m_texture_size);
        m_texture_size += width * height * (m_single_channel ? 1 : 4);
        if (width == 1 && height == 1)
            break;
    }
    m_texture_size = unsigned(m_mipmap_sizes.back().second) +
        (m_single_channel ? 1 : 4);
    m_mipmap_data = malloc(sizeof(imMipmapCascade));
}   // HQMipmapGenerator

// ----------------------------------------------------------------------------
HQMipmapGenerator::~HQMipmapGenerator()
{
    imFreeMipmapCascade((imMipmapCascade*)m_mipmap_data);
    free(m_mipmap_data);
}   // ~HQMipmapGenerator

// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedReload(void* ptr, void* param) const
{
    imReduceOptions options;
    imReduceSetOptions(&options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f);
    imMipmapCascade* mm_cascade = (imMipmapCascade*)m_mipmap_data;
#ifdef DEBUG
    int ret = imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
    assert(ret == 1);
#else
    imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
#endif
    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
    {
        memcpy((uint8_t*)ptr + m_mipmap_sizes[i].second,
            mm_cascade->mipmap[i + 1],
            m_mipmap_sizes[i].first.getArea() * (m_single_channel ? 1 : 4));
#ifdef DUMP_MIPMAP
        if (m_single_channel) continue;
        video::IImage* image = irr_driver->getVideoDriver()
            ->createImageFromData(video::ECF_A8R8G8B8, m_mipmap_sizes[i].first,
            mm_cascade->mipmap[i + 1], false/*ownForeignMemory*/);
        irr_driver->getVideoDriver()->writeImageToFile(image, std::string
            (StringUtils::toString(i) + "_" +
            StringUtils::getBasename(NamedPath.getPtr()) + ".png").c_str());
        image->drop();
#endif
    }
}   // threadedReload

// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedSubImage(void* ptr) const
{
#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
    glBindTexture(GL_TEXTURE_2D, m_texture_name);
    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
    {
        glTexSubImage2D(GL_TEXTURE_2D, i + 1, 0, 0,
            m_mipmap_sizes[i].first.Width, m_mipmap_sizes[i].first.Height,
            m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE,
            (uint8_t*)ptr + m_mipmap_sizes[i].second);
    }
    delete this;
#endif
}   // threadedSubImage

// ----------------------------------------------------------------------------
void HQMipmapGenerator::cleanThreadedLoader()
{
    delete[] m_orig_data;
}   // cleanThreadedLoader
98 src/graphics/hq_mipmap_generator.hpp Normal file
@ -0,0 +1,98 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#ifndef HEADER_HQ_MIPMAP_GENERATOR_HPP
#define HEADER_HQ_MIPMAP_GENERATOR_HPP

#include "graphics/gl_headers.hpp"
#include "utils/no_copy.hpp"
#include "utils/types.hpp"

#include <vector>
#include <ITexture.h>

using namespace irr;

class HQMipmapGenerator : public video::ITexture, NoCopy
{
private:
    uint8_t* m_orig_data;

    core::dimension2d<u32> m_size;

    GLuint m_texture_name;

    unsigned int m_texture_size;

    bool m_single_channel;

    void* m_mipmap_data;

    std::vector<std::pair<core::dimension2d<u32>, size_t> > m_mipmap_sizes;

public:
    // ------------------------------------------------------------------------
    HQMipmapGenerator(const io::path& name, uint8_t* data,
                      const core::dimension2d<u32>& size, GLuint texture_name,
                      bool single_channel);
    // ------------------------------------------------------------------------
    virtual ~HQMipmapGenerator();
    // ------------------------------------------------------------------------
    virtual void* lock(video::E_TEXTURE_LOCK_MODE mode =
                       video::ETLM_READ_WRITE, u32 mipmap_level = 0)
                                                            { return NULL; }
    // ------------------------------------------------------------------------
    virtual void unlock() {}
    // ------------------------------------------------------------------------
    virtual const core::dimension2d<u32>& getOriginalSize() const
                                                            { return m_size; }
    // ------------------------------------------------------------------------
    virtual const core::dimension2d<u32>& getSize() const { return m_size; }
    // ------------------------------------------------------------------------
    virtual video::E_DRIVER_TYPE getDriverType() const
    {
#if defined(USE_GLES2)
        return video::EDT_OGLES2;
#else
        return video::EDT_OPENGL;
#endif
    }
    // ------------------------------------------------------------------------
    virtual video::ECOLOR_FORMAT getColorFormat() const
                                                { return video::ECF_A8R8G8B8; }
    // ------------------------------------------------------------------------
    virtual u32 getPitch() const { return 0; }
    // ------------------------------------------------------------------------
    virtual bool hasMipMaps() const { return false; }
    // ------------------------------------------------------------------------
    virtual void regenerateMipMapLevels(void* mipmap_data = NULL) {}
    // ------------------------------------------------------------------------
    virtual u32 getOpenGLTextureName() const { return m_texture_name; }
    // ------------------------------------------------------------------------
    virtual u64 getHandle() { return 0; }
    // ------------------------------------------------------------------------
    virtual unsigned int getTextureSize() const { return m_texture_size; }
    // ------------------------------------------------------------------------
    virtual void threadedReload(void* ptr, void* param) const;
    // ------------------------------------------------------------------------
    virtual void threadedSubImage(void* ptr) const;
    // ------------------------------------------------------------------------
    virtual void cleanThreadedLoader();

};   // HQMipmapGenerator

#endif
@ -17,6 +17,7 @@

#include "graphics/stk_tex_manager.hpp"
#include "config/hardware_stats.hpp"
#include "config/user_config.hpp"
#include "graphics/central_settings.hpp"
#include "graphics/materials.hpp"
#include "graphics/threaded_tex_loader.hpp"
@ -33,11 +34,12 @@ STKTexManager::STKTexManager() : m_pbo(0), m_thread_size(0)
#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
    if (CVS->supportsThreadedTextureLoading())
    {
        UserConfigParams::m_hq_mipmap = true;
        pthread_mutex_init(&m_threaded_load_textures_mutex, NULL);
        pthread_cond_init(&m_cond_request, NULL);
        m_thread_size = HardwareStats::getNumProcessors();
        m_thread_size = core::clamp(m_thread_size, 1, 3);
        static const unsigned max_pbo_size = 48 * 1024 * 1024;
        m_thread_size = core::clamp(m_thread_size, 1, 8);
        static const unsigned max_pbo_size = 128 * 1024 * 1024;
        const unsigned each_capacity = max_pbo_size / m_thread_size;
        Log::info("STKTexManager", "%d thread(s) for texture loading,"
            " each capacity %d MB", m_thread_size,
@ -18,14 +18,17 @@
#include "graphics/stk_texture.hpp"
#include "config/user_config.hpp"
#include "graphics/central_settings.hpp"
#include "graphics/hq_mipmap_generator.hpp"
#include "graphics/irr_driver.hpp"
#include "graphics/material.hpp"
#include "graphics/material_manager.hpp"
#include "graphics/materials.hpp"
#include "graphics/stk_tex_manager.hpp"
#include "modes/profile_world.hpp"
#include "utils/log.hpp"
#include "utils/string_utils.hpp"

#include <algorithm>
#include <fstream>
#include <functional>

@ -215,7 +218,7 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
    const unsigned int w = m_size.Width;
    const unsigned int h = m_size.Height;
    unsigned int format = m_single_channel ? GL_RED : GL_BGRA;
    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA;
    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA8;

#if !defined(USE_GLES2)
    if (m_mesh_texture && CVS->isTextureCompressionEnabled())
@ -227,13 +230,41 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
        else
        {
            internal_format =
                m_single_channel ? GL_R8 : m_srgb ? GL_SRGB_ALPHA : GL_RGBA;
                m_single_channel ? GL_R8 : m_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8;
        }
#endif
    if (!useThreadedLoading())
        formatConversion(data, &format, w, h);

    if (!no_upload)
    if (useThreadedLoading())
    {
        if (m_texture_name == 0)
        {
            glGenTextures(1, &m_texture_name);
            glBindTexture(GL_TEXTURE_2D, m_texture_name);
            if (m_single_channel)
            {
                glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_RED);
            }
            int levels = 1;
            int width = w;
            int height = h;
            while (true)
            {
                width = width < 2 ? 1 : width >> 1;
                height = height < 2 ? 1 : height >> 1;
                levels++;
                if (width == 1 && height == 1)
                    break;
            }
            glTexStorage2D(GL_TEXTURE_2D, levels, internal_format, w, h);
        }
    }
    else if (!no_upload)
    {
        const bool reload = m_texture_name != 0;
        if (!reload)
@ -253,14 +284,14 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
            glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format,
                GL_UNSIGNED_BYTE, data);
        }
        else if (!useThreadedLoading())
        else
        {
            glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format,
                GL_UNSIGNED_BYTE, data);
        }
        if (orig_img)
            orig_img->unlock();
        if (hasMipMaps() && !useThreadedLoading())
        if (hasMipMaps())
            glGenerateMipmap(GL_TEXTURE_2D);
    }

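The while loop in the hunk above sizes the immutable glTexStorage2D allocation; for any texture with max(w, h) >= 2 it produces 1 + floor(log2(max(w, h))) levels, the full mip chain. A standalone brute-force check of that equivalence (not part of the patch):

/* Re-implements the level-counting loop and compares it against the closed
 * form over a range of sizes. */
#include <assert.h>

static int count_levels( int w, int h )
{
  int levels = 1;
  while( 1 )
  {
    w = w < 2 ? 1 : w >> 1;
    h = h < 2 ? 1 : h >> 1;
    levels++;
    if( ( w == 1 ) && ( h == 1 ) )
      break;
  }
  return levels;
}

int main( void )
{
  int w, h, m, expected;
  for( w = 1 ; w <= 512 ; w++ )
  {
    for( h = 1 ; h <= 512 ; h++ )
    {
      m = ( w > h ? w : h );
      if( m < 2 )
        continue;
      for( expected = 1 ; ( 1 << expected ) <= m ; expected++ );
      /* expected is now 1 + floor(log2(m)) */
      assert( count_levels( w, h ) == expected );
    }
  }
  return 0;
}
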
@ -594,8 +625,15 @@ void STKTexture::threadedReload(void* ptr, void* param) const
    if (orig_img)
    {
        orig_img->unlock();
        orig_img->setDeleteMemory(false);
        orig_img->drop();
    }
    if (useHQMipmap())
    {
        HQMipmapGenerator* hqmg = new HQMipmapGenerator(NamedPath, data,
            m_size, m_texture_name, m_single_channel);
        ((STKTexManager*)(param))->addThreadedLoadTexture(hqmg);
    }
    else
        delete[] data;
}   // threadedReload
@ -607,8 +645,11 @@ void STKTexture::threadedSubImage(void* ptr) const
    glBindTexture(GL_TEXTURE_2D, m_texture_name);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height,
        m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr);
    if (useHQMipmap())
        return;
    if (hasMipMaps())
        glGenerateMipmap(GL_TEXTURE_2D);

#endif
}   // threadedSubImage

@ -620,3 +661,10 @@ void STKTexture::cleanThreadedLoader()
    m_file = NULL;
    m_img_loader = NULL;
}   // cleanThreadedLoader

//-----------------------------------------------------------------------------
bool STKTexture::useHQMipmap() const
{
    return UserConfigParams::m_hq_mipmap && m_size.Width > 1 &&
        m_size.Height > 1;
}   // useHQMipmap
@ -76,6 +76,8 @@ private:
            sc[i] = data[4 * i + 3];
        return sc;
    }
    // ------------------------------------------------------------------------
    bool useHQMipmap() const;

public:
    // ------------------------------------------------------------------------
@ -84,8 +84,9 @@ void ThreadedTexLoader::handleCompletedTextures()
    size_t offset = m_pbo_offset;
    for (irr::video::ITexture* tex : m_completed_textures)
    {
        size_t cur_offset = tex->getTextureSize();
        tex->threadedSubImage((void*)offset);
        offset += tex->getTextureSize();
        offset += cur_offset;
    }
    m_completed_textures.clear();
#endif