Try HQMipmapGenerator

This commit is contained in:
Benau 2017-03-13 10:28:43 +08:00
parent 498ce3ebc9
commit 76aa38e5b4
16 changed files with 6226 additions and 9 deletions

View File

@ -116,6 +116,10 @@ if((WIN32 AND NOT MINGW) OR APPLE)
set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/") set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/")
set(JPEG_LIBRARY jpeglib) set(JPEG_LIBRARY jpeglib)
endif() endif()
add_subdirectory("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
include_directories("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
# Build the irrlicht library # Build the irrlicht library
add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht") add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht")
include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include") include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include")
@ -370,6 +374,7 @@ target_link_libraries(supertuxkart
bulletmath bulletmath
enet enet
stkirrlicht stkirrlicht
graphics_utils
${Angelscript_LIBRARIES} ${Angelscript_LIBRARIES}
${CURL_LIBRARIES} ${CURL_LIBRARIES}
${OGGVORBIS_LIBRARIES} ${OGGVORBIS_LIBRARIES}

View File

@ -0,0 +1,9 @@
# Builds the static helper library "graphics_utils" from the C sources under
# mipmap/.
cmake_minimum_required(VERSION 2.6)
if (UNIX OR MINGW)
# Compiler options are passed through add_definitions(): optimisation level,
# x86 SIMD instruction-set switches and fast-math.
# NOTE(review): -mavx and the -msse* switches are requested unconditionally on
# every UNIX/MinGW build; this presumably assumes an x86 target whose CPU
# supports AVX -- confirm for non-x86 hosts and older processors.
add_definitions(-O3 -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -ffast-math)
endif()
add_library(graphics_utils STATIC
mipmap/cpusimd.c
mipmap/img.c
mipmap/imgresize.c
)

View File

@ -0,0 +1,568 @@
/* -----------------------------------------------------------------------------
*
* Copyright (c) 2008-2016 Alexis Naveros.
*
*
* The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
* See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
*
*
* Some functions are Copyright (C) 2008 José Fonseca
* See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps()
*
*
* Portions developed under contract to the SURVICE Engineering Company.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* -----------------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <sys/time.h>
#include "cpusimd.h"
////
#if CPU_SSE_SUPPORT
/*
 * Shared four-lane constant tables.  Every table is tagged CPU_ALIGN16 and is
 * read by the SIMD routines as one whole 128-bit vector.
 */

/* Bit masks that isolate, respectively clear, the IEEE-754 sign bit */
const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };

/* Small floating point constants */
const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 };
const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 };
const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 };
const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 };

/* Small integer constants; simd4uOneInv is ~1 (0xfffffffe), written unsigned to match the table type */
const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 };
const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1u, ~1u, ~1u, ~1u };
const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 };
const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 };

const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 };

/*
 * Pi is spelled out as a literal instead of M_PI: M_PI is not part of ISO C
 * and is missing from <math.h> on some toolchains unless extra feature macros
 * are set.  The literal rounds to the same float value.
 */
const float simd4fPi[4] CPU_ALIGN16 = { 3.14159265358979323846f, 3.14159265358979323846f, 3.14159265358979323846f, 3.14159265358979323846f };

/* Lane indices 0..3 as floats */
const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 };

/* All-ones in the fourth lane only */
const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff };

/* 255 and its reciprocal */
const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f };
const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f };
#endif
////
#if CPU_SSE2_SUPPORT
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/* Replicates one scalar literal into all four lanes of a table initialiser. */
#define SIMD4F_CEPHES_X4(v) { v, v, v, v }

/* 4/Pi, used to scale the argument before the integer part is extracted */
static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( 1.27323954473516 );

/*
 * Negated DP1..DP3 terms of the "extended precision modular arithmetic" step
 * x = ((x - y * DP1) - y * DP2) - y * DP3; stored negated so the code can add.
 */
static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -0.78515625 );
static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -2.4187564849853515625e-4 );
static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -3.77489497744594108e-8 );

/* Coefficients of the sine polynomial */
static const float simd4f_sincof_p0[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -1.9515295891E-4 );
static const float simd4f_sincof_p1[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( 8.3321608736E-3 );
static const float simd4f_sincof_p2[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -1.6666654611E-1 );

/* Coefficients of the cosine polynomial */
static const float simd4f_coscof_p0[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( 2.443315711809948E-005 );
static const float simd4f_coscof_p1[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( -1.388731625493765E-003 );
static const float simd4f_coscof_p2[4] CPU_ALIGN16 = SIMD4F_CEPHES_X4( 4.166664568298827E-002 );

#undef SIMD4F_CEPHES_X4
/*
 * Four-lane single-precision sine.
 *
 * The argument is scaled by 4/Pi to find which octant it falls in, reduced
 * with a three-step subtraction, and then both a cosine-coefficient and a
 * sine-coefficient polynomial are evaluated for every lane.  A bit mask picks
 * the right one per lane, so there are no branches.
 */
__m128 simd4f_sin_ps( __m128 x )
{
  __m128 signflip, octant, xsq, cospoly, sinpoly, usesin;
  __m128i j, swapsign, polysel;

  /* Keep the sign of the input aside and carry on with |x| */
  signflip = _mm_and_ps( x, _mm_castsi128_ps( _mm_load_si128( (const __m128i *)simd4fSignMask ) ) );
  x = _mm_and_ps( x, _mm_castsi128_ps( _mm_load_si128( (const __m128i *)simd4fSignMaskInv ) ) );

  /* j = (int(|x| * 4/Pi) + 1) & ~1, held both as integer and as float */
  octant = _mm_mul_ps( x, _mm_load_ps( simd4f_cephes_FOPI ) );
  j = _mm_cvttps_epi32( octant );
  j = _mm_add_epi32( j, _mm_load_si128( (const __m128i *)simd4uOne ) );
  j = _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uOneInv ) );
  octant = _mm_cvtepi32_ps( j );

  /* The "4" bit of j, shifted up into the sign position, toggles the result sign */
  swapsign = _mm_slli_epi32( _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uFour ) ), 29 );
  signflip = _mm_xor_ps( signflip, _mm_castsi128_ps( swapsign ) );

  /* All-ones in lanes whose "2" bit of j is clear: those lanes keep the sine polynomial */
  polysel = _mm_cmpeq_epi32( _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uTwo ) ), _mm_setzero_si128() );
  usesin = _mm_castsi128_ps( polysel );

  /* Argument reduction: x = ((x - j * DP1) - j * DP2) - j * DP3 (tables hold the negated terms) */
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP1 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP2 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP3 ) ) );
  xsq = _mm_mul_ps( x, x );

  /* Cosine polynomial: ((p0*z + p1)*z + p2)*z*z - z/2 + 1, with z = x*x */
  cospoly = _mm_mul_ps( _mm_load_ps( simd4f_coscof_p0 ), xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p1 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p2 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_sub_ps( cospoly, _mm_mul_ps( xsq, _mm_load_ps( simd4fHalf ) ) );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4fOne ) );

  /* Sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
  sinpoly = _mm_mul_ps( _mm_load_ps( simd4f_sincof_p0 ), xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p1 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p2 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_mul_ps( sinpoly, x );
  sinpoly = _mm_add_ps( sinpoly, x );

  /* Per-lane selection: zero out the polynomial that does not apply, then add */
  sinpoly = _mm_and_ps( usesin, sinpoly );
  cospoly = _mm_andnot_ps( usesin, cospoly );
  cospoly = _mm_add_ps( cospoly, sinpoly );

  /* Apply the accumulated sign */
  return _mm_xor_ps( cospoly, signflip );
}
/*
 * Four-lane single-precision cosine.
 *
 * Follows the same scheme as simd4f_sin_ps(); the differences are that the
 * input sign is discarded and that the octant index is shifted by two before
 * the sign and the polynomial selection are derived from it.
 */
__m128 simd4f_cos_ps( __m128 x )
{
  __m128 signflip, octant, xsq, cospoly, sinpoly, usesin;
  __m128i j, polysel;

  /* Only |x| is needed */
  x = _mm_and_ps( x, _mm_castsi128_ps( _mm_load_si128( (const __m128i *)simd4fSignMaskInv ) ) );

  /* j = (int(|x| * 4/Pi) + 1) & ~1, held both as integer and as float */
  octant = _mm_mul_ps( x, _mm_load_ps( simd4f_cephes_FOPI ) );
  j = _mm_cvttps_epi32( octant );
  j = _mm_add_epi32( j, _mm_load_si128( (const __m128i *)simd4uOne ) );
  j = _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uOneInv ) );
  octant = _mm_cvtepi32_ps( j );

  /* Shift the index by two for the cosine */
  j = _mm_sub_epi32( j, _mm_load_si128( (const __m128i *)simd4uTwo ) );

  /* Result sign: the "4" bit of the inverted index, moved into the sign position */
  signflip = _mm_castsi128_ps( _mm_slli_epi32( _mm_andnot_si128( j, _mm_load_si128( (const __m128i *)simd4uFour ) ), 29 ) );

  /* All-ones in lanes whose "2" bit of the shifted index is clear */
  polysel = _mm_cmpeq_epi32( _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uTwo ) ), _mm_setzero_si128() );
  usesin = _mm_castsi128_ps( polysel );

  /* Argument reduction: x = ((x - j * DP1) - j * DP2) - j * DP3 (tables hold the negated terms) */
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP1 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP2 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP3 ) ) );
  xsq = _mm_mul_ps( x, x );

  /* Cosine polynomial: ((p0*z + p1)*z + p2)*z*z - z/2 + 1, with z = x*x */
  cospoly = _mm_mul_ps( _mm_load_ps( simd4f_coscof_p0 ), xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p1 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p2 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_sub_ps( cospoly, _mm_mul_ps( xsq, _mm_load_ps( simd4fHalf ) ) );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4fOne ) );

  /* Sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
  sinpoly = _mm_mul_ps( _mm_load_ps( simd4f_sincof_p0 ), xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p1 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p2 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_mul_ps( sinpoly, x );
  sinpoly = _mm_add_ps( sinpoly, x );

  /* Per-lane selection: zero out the polynomial that does not apply, then add */
  sinpoly = _mm_and_ps( usesin, sinpoly );
  cospoly = _mm_andnot_ps( usesin, cospoly );
  cospoly = _mm_add_ps( cospoly, sinpoly );

  /* Apply the sign */
  return _mm_xor_ps( cospoly, signflip );
}
/*
 * Four-lane single-precision sine and cosine computed together.
 *
 * Shares the octant extraction, the argument reduction and both polynomial
 * evaluations between the two results; per lane, one polynomial ends up in
 * the sine output and the other one in the cosine output.
 *
 * x : input vector
 * s : receives the sines
 * c : receives the cosines
 */
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c )
{
  __m128 sinsign, cossign, octant, xsq, cospoly, sinpoly, usesin;
  __m128 sinfromsinpoly, sinfromcospoly, sinval, cosval;
  __m128i j, jcos, sinswap, polysel;

  /* Keep the sign of the input aside (it only affects the sine) and carry on with |x| */
  sinsign = _mm_and_ps( x, _mm_castsi128_ps( _mm_load_si128( (const __m128i *)simd4fSignMask ) ) );
  x = _mm_and_ps( x, _mm_castsi128_ps( _mm_load_si128( (const __m128i *)simd4fSignMaskInv ) ) );

  /* j = (int(|x| * 4/Pi) + 1) & ~1, held both as integer and as float */
  octant = _mm_mul_ps( x, _mm_load_ps( simd4f_cephes_FOPI ) );
  j = _mm_cvttps_epi32( octant );
  j = _mm_add_epi32( j, _mm_load_si128( (const __m128i *)simd4uOne ) );
  j = _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uOneInv ) );
  octant = _mm_cvtepi32_ps( j );
  jcos = j;

  /* Sine: sign toggle from the "4" bit, polynomial selection from the "2" bit */
  sinswap = _mm_slli_epi32( _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uFour ) ), 29 );
  polysel = _mm_cmpeq_epi32( _mm_and_si128( j, _mm_load_si128( (const __m128i *)simd4uTwo ) ), _mm_setzero_si128() );
  usesin = _mm_castsi128_ps( polysel );

  /* Argument reduction: x = ((x - j * DP1) - j * DP2) - j * DP3 (tables hold the negated terms) */
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP1 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP2 ) ) );
  x = _mm_add_ps( x, _mm_mul_ps( octant, _mm_load_ps( simd4f_minus_cephes_DP3 ) ) );

  /* Cosine sign: index shifted by two, inverted "4" bit moved into the sign position */
  jcos = _mm_sub_epi32( jcos, _mm_load_si128( (const __m128i *)simd4uTwo ) );
  jcos = _mm_andnot_si128( jcos, _mm_load_si128( (const __m128i *)simd4uFour ) );
  cossign = _mm_castsi128_ps( _mm_slli_epi32( jcos, 29 ) );
  sinsign = _mm_xor_ps( sinsign, _mm_castsi128_ps( sinswap ) );

  xsq = _mm_mul_ps( x, x );

  /* Cosine polynomial: ((p0*z + p1)*z + p2)*z*z - z/2 + 1, with z = x*x */
  cospoly = _mm_mul_ps( _mm_load_ps( simd4f_coscof_p0 ), xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p1 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4f_coscof_p2 ) );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_mul_ps( cospoly, xsq );
  cospoly = _mm_sub_ps( cospoly, _mm_mul_ps( xsq, _mm_load_ps( simd4fHalf ) ) );
  cospoly = _mm_add_ps( cospoly, _mm_load_ps( simd4fOne ) );

  /* Sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
  sinpoly = _mm_mul_ps( _mm_load_ps( simd4f_sincof_p0 ), xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p1 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_add_ps( sinpoly, _mm_load_ps( simd4f_sincof_p2 ) );
  sinpoly = _mm_mul_ps( sinpoly, xsq );
  sinpoly = _mm_mul_ps( sinpoly, x );
  sinpoly = _mm_add_ps( sinpoly, x );

  /* Split each polynomial: the masked part feeds the sine, the remainder feeds the cosine */
  sinfromsinpoly = _mm_and_ps( usesin, sinpoly );
  sinfromcospoly = _mm_andnot_ps( usesin, cospoly );
  sinpoly = _mm_sub_ps( sinpoly, sinfromsinpoly );
  cospoly = _mm_sub_ps( cospoly, sinfromcospoly );
  sinval = _mm_add_ps( sinfromcospoly, sinfromsinpoly );
  cosval = _mm_add_ps( cospoly, sinpoly );

  /* Apply the signs and hand both results back */
  *s = _mm_xor_ps( sinval, sinsign );
  *c = _mm_xor_ps( cosval, cossign );
}
#endif
////
#if CPU_SSE2_SUPPORT
/* Copyright (C) 2008 José Fonseca
http://jrfonseca.blogspot.ca/2008/09/fast-sse2-pow-tables-or-polynomials.html
MIT license
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
 * Horner-form polynomial helpers: POLYn(x, c0, ..., cn) expands to
 * c0 + x * (c1 + x * (... + x * cn)), with every coefficient broadcast to
 * four lanes.
 */
#define POLY0(x,c0) _mm_set1_ps(c0)
#define POLY1(x,c0,c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x,c0,c1,c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x,c0,c1,c2,c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x,c0,c1,c2,c3,c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x,c0,c1,c2,c3,c4,c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Compile-time choice of the polynomial used by exp2 and log2 */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

/*
 * Approximate 2**x for four floats.
 *
 * The input is clamped, split into an integer and a fractional part, the
 * integer part is turned into a power of two by writing it straight into the
 * float exponent field, and the fractional part goes through a polynomial.
 */
__m128 simd4f_exp2_ps( __m128 x )
{
  __m128 frac, pow2whole, pow2frac;
  __m128i whole;

  /* Clamp the exponent */
  x = _mm_min_ps( x, _mm_set1_ps( 129.00000f ) );
  x = _mm_max_ps( x, _mm_set1_ps( -126.99999f ) );

  /* Integer part: x - 0.5 converted to integer */
  whole = _mm_cvtps_epi32( _mm_sub_ps( x, _mm_set1_ps( 0.5f ) ) );

  /* Fractional part: what is left of x */
  frac = _mm_sub_ps( x, _mm_cvtepi32_ps( whole ) );

  /* 2**whole, built by placing the biased exponent (whole + 127) at bit 23 */
  whole = _mm_add_epi32( whole, _mm_set1_epi32( 127 ) );
  pow2whole = _mm_castsi128_ps( _mm_slli_epi32( whole, 23 ) );

  /* Polynomial fit of 2**frac (coefficients from the original author) */
#if EXP_POLY_DEGREE == 5
  pow2frac = POLY5( frac, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f );
#elif EXP_POLY_DEGREE == 4
  pow2frac = POLY4( frac, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f );
#elif EXP_POLY_DEGREE == 3
  pow2frac = POLY3( frac, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f );
#elif EXP_POLY_DEGREE == 2
  pow2frac = POLY2( frac, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f );
#else
#error
#endif

  return _mm_mul_ps( pow2whole, pow2frac );
}

/*
 * Approximate log2(x) for four floats.
 *
 * The exponent field is read out as an integer, the mantissa is rebuilt as a
 * float in [1, 2[ and run through a polynomial fit of log2(m)/(m - 1).
 */
__m128 simd4f_log2_ps( __m128 x )
{
  const __m128i expbits = _mm_set1_epi32( 0x7f800000 );
  const __m128i mantbits = _mm_set1_epi32( 0x007fffff );
  const __m128 unity = _mm_set1_ps( 1.0f );
  __m128i raw, biased;
  __m128 exponent, mantissa, logmantissa;

  raw = _mm_castps_si128( x );

  /* Unbiased exponent as a float */
  biased = _mm_srli_epi32( _mm_and_si128( raw, expbits ), 23 );
  exponent = _mm_cvtepi32_ps( _mm_sub_epi32( biased, _mm_set1_epi32( 127 ) ) );

  /* Mantissa bits combined with the bit pattern of 1.0f */
  mantissa = _mm_or_ps( _mm_castsi128_ps( _mm_and_si128( raw, mantbits ) ), unity );

  /*
   * Minimax polynomial fit of log2(m)/(m - 1) for m in [1, 2[; per the
   * original author the coefficients can be generated with
   * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   */
#if LOG_POLY_DEGREE == 6
  logmantissa = POLY5( mantissa, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f );
#elif LOG_POLY_DEGREE == 5
  logmantissa = POLY4( mantissa, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f );
#elif LOG_POLY_DEGREE == 4
  logmantissa = POLY3( mantissa, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f );
#elif LOG_POLY_DEGREE == 3
  logmantissa = POLY2( mantissa, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f );
#else
#error
#endif

  /* Multiplying by (m - 1) raises the degree by one and makes log2(1) come out as exactly 0 */
  logmantissa = _mm_mul_ps( logmantissa, _mm_sub_ps( mantissa, unity ) );

  return _mm_add_ps( logmantissa, exponent );
}

/* Approximate x**y per lane as exp2( log2(x) * y ). */
__m128 simd4f_pow_ps( __m128 x, __m128 y )
{
  __m128 scaledlog;
  scaledlog = _mm_mul_ps( simd4f_log2_ps( x ), y );
  return simd4f_exp2_ps( scaledlog );
}
#endif
////
#if CPU_SSE2_SUPPORT
/*
By Potatoswatter
http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent
*/
#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif
static inline CC_ALWAYSINLINE __m128 simd4f_fastpow_ps( __m128 arg, uint32_t expnum, uint32_t expden, uint32_t coeffnum, uint32_t coeffden )
{
__m128 ret = arg;
float corrfactor, powfactor;
/* Apply a constant pre-correction factor. */
corrfactor = exp2( 127.0 * expden / expnum - 127.0 ) * pow( 1.0 * coeffnum / coeffden, 1.0 * expden / expnum );
powfactor = 1.0 * expnum / expden;
ret = _mm_mul_ps( ret, _mm_set1_ps( corrfactor ) );
/* Reinterpret arg as integer to obtain logarithm. */
ret = _mm_cvtepi32_ps( _mm_castps_si128( ret ) );
/* Multiply logarithm by power. */
ret = _mm_mul_ps( ret, _mm_set1_ps( powfactor ) );
/* Convert back to "integer" to exponentiate. */
ret = _mm_castsi128_ps( _mm_cvtps_epi32( ret ) );
return ret;
}
__m128 simd4f_pow12d5_ps( __m128 arg )
{
/* Lower exponents provide lower initial error, but too low causes overflow. */
__m128 xf = simd4f_fastpow_ps( arg, 4, 5, (int)( 1.38316186f * 1e9 ), (int)1e9 );
/* Imprecise 4-cycle sqrt is still far better than fastpow, good enough. */
__m128 xfm4 = _mm_rsqrt_ps( xf );
__m128 xf4 = _mm_mul_ps( xf, xfm4 );
/* Precisely calculate x^2 and x^3 */
__m128 x2 = _mm_mul_ps( arg, arg );
__m128 x3 = _mm_mul_ps( x2, arg );
/* Overestimate of x^2 * x^0.4 */
x2 = _mm_mul_ps( x2, xf4 );
/* Get x^-0.2 from x^0.4, and square it for x^-0.4. Combine into x^-0.6. */
__m128 xfm2 = _mm_rsqrt_ps( xf4 );
x3 = _mm_mul_ps( x3, xfm4 );
x3 = _mm_mul_ps( x3, xfm2 );
return _mm_mul_ps( _mm_add_ps( x2, x3 ), _mm_set1_ps( 1.0f/1.960131704207789f * 0.9999f ) );
}
__m128 simd4f_pow5d12_ps( __m128 arg )
{
/* 5/12 is too small, so compute the 4th root of 20/12 instead. */
/* 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. */
/* weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 */
__m128 xf = simd4f_fastpow_ps( arg, 2, 3, (int)( 0.629960524947437f * 1e9 ), (int)1e9 );
__m128 xover = _mm_mul_ps( arg, xf );
__m128 xfm1 = _mm_rsqrt_ps( xf );
__m128 x2 = _mm_mul_ps( arg, arg );
__m128 xunder = _mm_mul_ps( x2, xfm1 );
/* sqrt2 * over + 2 * sqrt2 * under */
__m128 xavg = _mm_mul_ps( _mm_set1_ps( 1.0f/( 3.0f * 0.629960524947437f ) * 0.999852f ), _mm_add_ps( xover, xunder ) );
xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
return xavg;
}
#endif
////

View File

@ -0,0 +1,410 @@
/* -----------------------------------------------------------------------------
*
* Copyright (c) 2008-2016 Alexis Naveros.
*
* The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
* See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
*
* Portions developed under contract to the SURVICE Engineering Company.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* -----------------------------------------------------------------------------
*/
#ifndef CPUSIMD_H
#define CPUSIMD_H
////
/*
 * Instruction-set detection.
 *
 * For every x86 extension, the matching intrinsics header is included and
 * CPU_<NAME>_SUPPORT is defined to (1) when either the compiler's predefined
 * feature macro or the corresponding CPU_ENABLE_<NAME> override evaluates to
 * non-zero.  The tests use "#if MACRO" rather than "#ifdef", so a macro that
 * is defined as 0 counts as disabled.
 *
 * NOTE(review): several of the headers below (<popcntintrin.h>,
 * <lzcntintrin.h>, <bmiintrin.h>, ...) are included directly; some compiler
 * versions may reject direct inclusion of these sub-headers -- confirm.
 */
#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
/* SSE and SSE2 are also assumed for the MSVC targets signalled by _M_X64 / _M_IX86_FP */
#if __SSE__ || _M_X64 || _M_IX86_FP >= 1 || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2 || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
/* __AVX__ is additionally accepted as evidence for SSE3, SSSE3 and SSE4.1 */
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#if __SSE4A__ || CPU_ENABLE_SSE4A
#include <ammintrin.h>
#define CPU_SSE4A_SUPPORT (1)
#endif
#if __AVX__ || CPU_ENABLE_AVX
#include <immintrin.h>
#define CPU_AVX_SUPPORT (1)
#endif
#if __AVX2__ || CPU_ENABLE_AVX2
#include <immintrin.h>
#define CPU_AVX2_SUPPORT (1)
#endif
#if __XOP__ || CPU_ENABLE_XOP
#include <immintrin.h>
#define CPU_XOP_SUPPORT (1)
#endif
/* NOTE(review): GCC/Clang appear to predefine __FMA__ rather than __FMA3__ for FMA3 -- confirm whether this test can ever fire without CPU_ENABLE_FMA3 */
#if __FMA3__ || CPU_ENABLE_FMA3
#include <immintrin.h>
#define CPU_FMA3_SUPPORT (1)
#endif
#if __FMA4__ || CPU_ENABLE_FMA4
#include <immintrin.h>
#define CPU_FMA4_SUPPORT (1)
#endif
#if __RDRND__ || CPU_ENABLE_RDRND
#include <immintrin.h>
#define CPU_RDRND_SUPPORT (1)
#endif
#if __POPCNT__ || CPU_ENABLE_POPCNT
#include <popcntintrin.h>
#define CPU_POPCNT_SUPPORT (1)
#endif
#if __LZCNT__ || CPU_ENABLE_LZCNT
#include <lzcntintrin.h>
#define CPU_LZCNT_SUPPORT (1)
#endif
#if __F16C__ || CPU_ENABLE_F16C
#include <f16cintrin.h>
#define CPU_F16C_SUPPORT (1)
#endif
#if __BMI__ || CPU_ENABLE_BMI
#include <bmiintrin.h>
#define CPU_BMI_SUPPORT (1)
#endif
#if __BMI2__ || CPU_ENABLE_BMI2
#include <bmi2intrin.h>
#define CPU_BMI2_SUPPORT (1)
#endif
#if __TBM__ || CPU_ENABLE_TBM
#include <tbmintrin.h>
#define CPU_TBM_SUPPORT (1)
#endif
/*
 * Alignment qualifiers CPU_ALIGN16 / CPU_ALIGN32 / CPU_ALIGN64.
 *
 * GCC-compatible compilers and the Intel compiler use the aligned attribute,
 * MSVC uses __declspec(align()).  On any other compiler the qualifiers expand
 * to nothing, a warning is emitted and the SIMD code paths are switched off
 * by removing the CPU_*_SUPPORT macros, since aligned storage cannot be
 * requested there.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
 #define CPU_ALIGN16 __attribute__((aligned(16)))
 #define CPU_ALIGN32 __attribute__((aligned(32)))
 #define CPU_ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
 #define CPU_ALIGN16 __declspec(align(16))
 /* CPU_ALIGN32 was missing from this branch although the other two branches define it */
 #define CPU_ALIGN32 __declspec(align(32))
 #define CPU_ALIGN64 __declspec(align(64))
#else
 #define CPU_ALIGN16
 #define CPU_ALIGN32
 #define CPU_ALIGN64
 #warning "SSE/AVX Disabled: Unsupported Compiler."
 #undef CPU_SSE_SUPPORT
 #undef CPU_SSE2_SUPPORT
 #undef CPU_SSE3_SUPPORT
 #undef CPU_SSSE3_SUPPORT
 #undef CPU_SSE4_1_SUPPORT
 #undef CPU_SSE4_2_SUPPORT
 #undef CPU_AVX_SUPPORT
 #undef CPU_AVX2_SUPPORT
 #undef CPU_XOP_SUPPORT
 #undef CPU_FMA3_SUPPORT
 #undef CPU_FMA4_SUPPORT
#endif
////
/*
 * Scalar float helpers for division, square root and reciprocal square root.
 * With SSE they are built from the rcp/rsqrt estimate instructions; without
 * SSE they fall back to plain division and sqrtf().
 */
#if CPU_SSE_SUPPORT
 /* num / den as num * rcp(den) */
 #define CPU_APPROX_DIV_FLOAT(num,den) \
  _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(num),_mm_rcp_ss(_mm_set_ss(den))))
 /* sqrt(val) as val * rsqrt(val) */
 #define CPU_APPROX_SQRT_FLOAT(val) \
  _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(val),_mm_rsqrt_ss(_mm_set_ss(val))))
 /* 1 / sqrt(val) */
 #define CPU_APPROX_RSQRT_FLOAT(val) \
  _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(val)))
 /* num / sqrt(den) as num * rsqrt(den) */
 #define CPU_APPROX_DIVSQRT_FLOAT(num,den) \
  _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(num),_mm_rsqrt_ss(_mm_set_ss(den))))
#else
 #define CPU_APPROX_DIV_FLOAT(num,den) ((num)/(den))
 #define CPU_APPROX_SQRT_FLOAT(val) (sqrtf(val))
 #define CPU_APPROX_RSQRT_FLOAT(val) (1.0/sqrtf(val))
 #define CPU_APPROX_DIVSQRT_FLOAT(num,den) ((num)/sqrtf(den))
#endif
/*
 * Horizontal pairwise add.
 *
 * CPU_HADD_PS(vx,vy) yields { vx0+vx1, vx2+vx3, vy0+vy1, vy2+vy3 } and
 * CPU_HADD_PD(vx,vy) yields { vx0+vx1, vy0+vy1 }.  With SSE3 these map to the
 * native hadd instructions; otherwise they are emulated with shuffles.
 *
 * The emulated CPU_HADD_PD previously evaluated to { vx0+vy1, vx1 }, which
 * only agrees with the native form in the low lane when both operands are
 * the same vector; it now produces both lanes like _mm_hadd_pd.  It stays a
 * macro (arguments are evaluated twice) so that the double-precision types
 * are only needed where it is actually used.
 */
#if CPU_SSE3_SUPPORT
 #define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy)
 #define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy)
#elif CPU_SSE_SUPPORT
static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy )
{
  __m128 vh, vl;
  /* Odd elements of both inputs, then even elements of both inputs */
  vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) );
  vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) );
  return _mm_add_ps( vh, vl );
}
 /* { vx0, vy0 } + { vx1, vy1 } */
 #define CPU_HADD_PD(vx,vy) _mm_add_pd(_mm_unpacklo_pd(vx,vy),_mm_unpackhi_pd(vx,vy))
#endif
/*
 * Widen the four lowest 8-bit elements of a vector to 32-bit integers.
 * The second argument is a zero vector; only the SSE2 variants read it.
 */
#if CPU_SSE4_1_SUPPORT
 /* SSE4.1 has direct zero- and sign-extending conversions */
 #define CPU_CVT_U8_TO_I32(bytes,zero) _mm_cvtepu8_epi32(bytes)
 #define CPU_CVT_S8_TO_I32(bytes,zero) _mm_cvtepi8_epi32(bytes)
#elif CPU_SSE2_SUPPORT
 /* Unsigned: interleave with zero twice, 8 -> 16 -> 32 bits */
 #define CPU_CVT_U8_TO_I32(bytes,zero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((bytes),(zero)),(zero))
/* Signed: interleave with a per-byte sign mask instead of zero */
static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero )
{
  /* 0xff in every byte that is negative, 0x00 elsewhere */
  const __m128i negative = _mm_cmpgt_epi8( vzero, vx );
  /* Low halves: value byte followed by its sign byte */
  const __m128i lowwords = _mm_unpacklo_epi8( vx, negative );
  /* High halves: two sign bytes */
  const __m128i highwords = _mm_unpacklo_epi8( negative, negative );
  return _mm_unpacklo_epi16( lowwords, highwords );
}
#endif
/*
 * Per-lane select between two vectors under a mask.  SSE4.1 uses the native
 * blendv instructions; the SSE2 form computes (~sel & a) | (b & sel) and
 * therefore evaluates the mask argument twice.
 */
#if CPU_SSE4_1_SUPPORT
 #define CPU_BLENDV_PS(a,b,sel) _mm_blendv_ps(a,b,sel)
 #define CPU_BLENDV_PD(a,b,sel) _mm_blendv_pd(a,b,sel)
#elif CPU_SSE2_SUPPORT
 #define CPU_BLENDV_PS(a,b,sel) _mm_or_ps(_mm_andnot_ps(sel,a),_mm_and_ps(b,sel))
 #define CPU_BLENDV_PD(a,b,sel) _mm_or_pd(_mm_andnot_pd(sel,a),_mm_and_pd(b,sel))
#endif
/*
CPU_FMADD = ((f0*f1)+t0)
CPU_FMSUB = ((f0*f1)-t0)

Three implementations are selected at compile time: FMA3 intrinsics, FMA4
(macc/msub) intrinsics, or a separate multiply followed by an add/subtract.
The _SS/_PS/_SD/_PD suffixes follow the naming of the intrinsics they wrap;
the 256 variants wrap the _mm256_ forms.

NOTE(review): the last branch is taken whenever neither FMA flavour is
enabled, independent of the SSE/AVX support macros; being macros, the
definitions only matter where they are expanded.
NOTE(review): the 256-bit scalar names (_mm256_*_ss / _mm256_*_sd) do not look
like existing intrinsics -- confirm before using the CPU_*256_SS/_SD macros.
*/
#if CPU_FMA3_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0)
#elif CPU_FMA4_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0)
#else
#define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0)
#endif
////
/*
 * Declarations of the shared four-lane constant tables (sign masks, small
 * float and integer constants, Pi, lane indices, alpha-lane mask, 255 and
 * 1/255).  Only available when SSE support was detected.
 *
 * NOTE(review): the uint32_t tables need <stdint.h>; no include of it is
 * visible in this part of the header, so it presumably has to come from the
 * including file -- confirm.
 */
#if CPU_SSE_SUPPORT
extern const uint32_t simd4fSignMask[4];
extern const uint32_t simd4fSignMaskInv[4];
extern const float simd4fHalf[4];
extern const float simd4fOne[4];
extern const float simd4fTwo[4];
extern const float simd4fThree[4];
extern const uint32_t simd4uOne[4];
extern const uint32_t simd4uOneInv[4];
extern const uint32_t simd4uTwo[4];
extern const uint32_t simd4uFour[4];
extern const float simd4fQuarter[4];
extern const float simd4fPi[4];
extern const float simd4fZeroOneTwoThree[4];
extern const uint32_t simd4fAlphaMask[4];
extern const float simd4f255[4];
extern const float simd4f255Inv[4];
#endif
/* Out-of-line SSE2 math routines, four floats at a time */
#if CPU_SSE2_SUPPORT

/* Trigonometry. Input range between -8192 and 8192 */
__m128 simd4f_sin_ps( __m128 x );
__m128 simd4f_cos_ps( __m128 x );
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c );

/* Base-2 exponential and logarithm, and the general power built from them */
__m128 simd4f_exp2_ps( __m128 x );
__m128 simd4f_log2_ps( __m128 x );
__m128 simd4f_pow_ps( __m128 x, __m128 y );

/* Fixed-exponent powers: arg**(12/5) and arg**(5/12) */
__m128 simd4f_pow12d5_ps( __m128 arg );
__m128 simd4f_pow5d12_ps( __m128 arg );

#endif
////
#if CPU_SSE2_SUPPORT
#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif
static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx )
{
__m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
vx2 = _mm_mul_ps( vx, vx );
vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) );
vpwsqrtinv = _mm_rsqrt_ps( vpow );
vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) );
}
/* Fast packed approximation of vx^(5/12) for four floats at once. */
/* A coarse estimate proportional to vx^(2/3) is built from the float bit pattern, */
/* combined into a value that scales as vx^(5/3), and two successive square roots */
/* bring the exponent down to 5/12. */
/* NOTE(review): the magic constants, the accuracy and the valid input range */
/* (presumably positive, finite values) are not documented here -- confirm before relying on them. */
static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
{
__m128 vpow;
/* Coarse estimate proportional to vx^(2/3), built by scaling the bit pattern read as an integer */
vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) );
/* ( vx * vpow ) + ( vx^2 * vpow^-0.5 ): both terms scale as vx^(5/3) */
vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) );
#if 0
/* Disabled alternative: square roots computed as vx * rsqrt( vx ) */
vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
#else
/* Fourth root: the exponent goes from 5/3 to 5/12 */
vx = _mm_sqrt_ps( vx );
vx = _mm_sqrt_ps( vx );
#endif
return vx;
}
#endif
////
#if CPU_SSE_SUPPORT
static inline void simdPrintDebugSSE4f( char *str, __m128 v )
{
float CPU_ALIGN16 store[4];
_mm_store_ps( (void *)store, v );
printf( "%s %f %f %f %f\n", str, store[0], store[1], store[2], store[3] );
return;
}
static inline void simdPrintDebugSSE2d( char *str, __m128d v )
{
double CPU_ALIGN16 store[2];
_mm_store_pd( (void *)store, v );
printf( "%s %f %f\n", str, store[0], store[1] );
return;
}
static inline void simdPrintDebugSSE16u8( char *str, __m128i v )
{
uint8_t CPU_ALIGN16 store[16];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] );
return;
}
static inline void simdPrintDebugSSE8u16( char *str, __m128i v )
{
uint16_t CPU_ALIGN16 store[8];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] );
return;
}
static inline void simdPrintDebugSSE4u32( char *str, __m128i v )
{
uint32_t CPU_ALIGN16 store[4];
_mm_store_si128( (void *)store, v );
printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] );
return;
}
static inline void simdPrintDebugSSE2u64( char *str, __m128i v )
{
uint64_t CPU_ALIGN16 store[2];
_mm_store_si128( (void *)store, v );
printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] );
return;
}
#endif
////
#endif

View File

@ -0,0 +1,628 @@
/* *****************************************************************************
*
* Copyright (c) 2007-2016 Alexis Naveros.
* Portions developed under contract to the SURVICE Engineering Company.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this file; see the file named COPYING for more
* information.
*
* *****************************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include "cpusimd.h"
#include "img.h"
#ifndef ADDRESS
#define ADDRESS(p,o) ((void *)(((char *)p)+(o)))
#endif
////
/*
 * Copy a rectangle of sizex by sizey pixels from (srcx,srcy) to (dstx,dsty)
 * inside the same image.
 *
 * Source and destination live in one buffer, so the two rectangles may
 * overlap. Rows are moved with memmove() and visited in an order that never
 * overwrites a source row before it has been read, which makes overlapping
 * copies well defined. Non-overlapping copies behave exactly as before.
 *
 * No clipping is performed: both rectangles must lie inside the image.
 * A rectangle with a non-positive width or height copies nothing.
 */
void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey )
{
  int y;
  size_t rowbytes;
  ptrdiff_t step;
  char *base, *dst, *src;

  if( ( sizex <= 0 ) || ( sizey <= 0 ) )
    return;
  rowbytes = (size_t)sizex * (size_t)image->format.bytesperpixel;
  step = (ptrdiff_t)image->format.bytesperline;
  base = (char *)image->data;
  src = base + ( srcx * image->format.bytesperpixel ) + ( srcy * image->format.bytesperline );
  dst = base + ( dstx * image->format.bytesperpixel ) + ( dsty * image->format.bytesperline );
  if( dsty > srcy )
  {
    /* Destination rows come after the source rows: start from the last row and walk backwards, */
    /* otherwise writing row k would destroy source row k+(dsty-srcy) before it is copied */
    src += (ptrdiff_t)( sizey - 1 ) * step;
    dst += (ptrdiff_t)( sizey - 1 ) * step;
    step = -step;
  }
  for( y = 0 ; y < sizey ; y++ )
  {
    /* memmove, not memcpy: within one row the two spans may overlap when dsty == srcy */
    memmove( dst, src, rowbytes );
    src += step;
    dst += step;
  }
  return;
}
#if CPU_SSE2_SUPPORT
/* Constants for the SSE2 blend paths below, laid out for two pixels unpacked to 16 bits per channel */
/* All ones in the three colour lanes of each pixel, zero in the alpha lane: clears the blend factor of the alpha lane */
static const uint16_t CPU_ALIGN16 imgBlendRgbMask[8] = { 0xffff, 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000 };
/* Selects byte 3 (alpha) of each of four packed 32-bit pixels: used to test for fully transparent groups and to force alpha to 0xff */
static const uint8_t CPU_ALIGN16 imgBlendAlphaTestMask[16] = { 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff };
/* 128 in every 16-bit lane: rounding term added before the division by 255 */
static const uint16_t CPU_ALIGN16 imgBlendRoundBias[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };
#if CPU_SSSE3_SUPPORT
/* Byte shuffle control replicating the 16-bit alpha lane (bytes 6,7 and 14,15) across all four lanes of each unpacked pixel */
static const uint8_t CPU_ALIGN16 imgBlendShufMask[16] = { 6,7,6,7,6,7,6,7, 14,15,14,15,14,15,14,15 };
#endif
#endif
/* Alpha-blend srcimage onto dstimage, both using 4 bytes per pixel with alpha in byte 3. */
/* The top-left corner of the source lands on destination pixel (dstx,dsty). */
/* Colour channels become ( dst*(255-srca) + src*srca ) / 255 with rounding; destination alpha */
/* becomes ( dsta*255 + srca*srca ) / 255, saturated at 255. */
/* Nothing is clipped: the whole source image is processed, so it has to fit inside the */
/* destination at the requested position (see the TODO below). */
static void imgBlendImageRgba2Rgba( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
int x, y;
#if CPU_SSE2_SUPPORT
int row4size;
__m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
__m128i vzero, v255, vrgbmask, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
__m128i vshufmask;
#endif
#else
int32_t dstr, dstg, dstb, dsta;
int32_t srcr, srcg, srcb, srca;
#endif
unsigned char *src, *srcrow, *dstrow;
uint32_t *dst;
/* TODO: Other function to clamp copy area? */
#if CPU_SSE2_SUPPORT
/* Row width rounded down to a multiple of 4: the part handled four pixels at a time */
row4size = srcimage->format.width & ~3;
vzero = _mm_setzero_si128();
v255 = _mm_set1_epi16( 255 );
vrgbmask = _mm_load_si128( (void *)imgBlendRgbMask );
valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif
src = srcimage->data;
dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
for( y = 0 ; y < srcimage->format.height ; y++ )
{
srcrow = src;
dstrow = (unsigned char *)dst;
#if CPU_SSE2_SUPPORT
for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
{
/* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
vsrc23 = _mm_loadu_si128( (void *)srcrow );
/* Skip the group when all four source alphas are zero: the destination stays untouched */
if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
continue;
vdst23 = _mm_loadu_si128( (void *)dstrow );
/* r0__g0__b0__a0__, r1__g1__b1__a1__ */
vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
/* r2__g2__b2__a2__, r3__g3__b3__a3__ */
vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
/* __a0__a0__a0__a0, __a1__a1__a1__a1 */
vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
/* __a2__a2__a2__a2, __a3__a3__a3__a3 */
vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
/* Same alpha broadcast without SSSE3: replicate word 3 of each 64-bit half */
vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
/* dst*(255-a) + src*a + 128; vrgbmask zeroes the factor in the alpha lane, so alpha gets dsta*255 + srca*srca, */
/* and the saturating adds keep that sum from wrapping around */
vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, _mm_and_si128( vblend23, vrgbmask ) ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
/* Correction to divide by 255 instead of 256 */
vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
/* Combine interleaved and store */
_mm_storeu_si128( (void *)dstrow, _mm_packus_epi16( vdst01, vdst23 ) );
}
/* Remaining 0 to 3 pixels of the row, one pixel at a time with the same arithmetic */
for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
{
if( !( srcrow[3] ) )
continue;
/* A single 32-bit pixel is moved through the float load/store intrinsics purely as a bit pattern */
vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
/* Correction to divide by 255 instead of 256 */
vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
_mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_packus_epi16( vdst01, vdst01 ) ) );
}
#else
/* Portable path: same blend computed with 32-bit integers, one pixel at a time */
for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
{
/* Fully transparent source pixel: leave the destination untouched */
if( !( srcrow[3] ) )
continue;
srcr = (int32_t)srcrow[0];
srcg = (int32_t)srcrow[1];
srcb = (int32_t)srcrow[2];
srca = (int32_t)srcrow[3];
dstr = (int32_t)dstrow[0];
dstg = (int32_t)dstrow[1];
dstb = (int32_t)dstrow[2];
dsta = (int32_t)dstrow[3];
/* ( dst << 8 ) - dst is dst*255, so each line is dst*(255-srca) + src*srca + 128 */
dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
dsta = ( ( dsta << 8 ) - dsta + ( srca * srca ) + 128 );
/* Divide by 255 using shifts only */
dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
dsta = ( dsta + ( dsta >> 8 ) ) >> 8;
/* Only alpha is accumulated rather than interpolated, so only alpha can exceed 255 */
if( dsta > 255 )
dsta = 255;
dstrow[0] = (unsigned char)dstr;
dstrow[1] = (unsigned char)dstg;
dstrow[2] = (unsigned char)dstb;
dstrow[3] = (unsigned char)dsta;
}
#endif
src = ADDRESS( src, srcimage->format.bytesperline );
dst = ADDRESS( dst, dstimage->format.bytesperline );
}
return;
}
/* Alpha-blend srcimage (4 bytes per pixel, alpha in byte 3) onto a 4-byte-per-pixel destination */
/* whose fourth byte is not a meaningful alpha: every written pixel gets byte 3 set to 255. */
/* The top-left corner of the source lands on destination pixel (dstx,dsty). */
/* Colour channels become ( dst*(255-srca) + src*srca ) / 255 with rounding. */
/* Nothing is clipped: the whole source image is processed, so it has to fit inside the */
/* destination at the requested position (see the TODO below). */
/* NOTE(review): the 4-pixel SSE2 loop also writes byte 3 = 0xff for pixels whose source alpha is 0 */
/* when a neighbour in the same group is not transparent, whereas the single-pixel loops skip such */
/* pixels entirely -- presumably harmless for an "x" channel, but confirm. */
static void imgBlendImageRgba2Rgbx( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
int x, y;
#if CPU_SSE2_SUPPORT
int row4size;
__m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
__m128i vzero, v255, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
__m128i vshufmask;
#endif
#else
int32_t dstr, dstg, dstb;
int32_t srcr, srcg, srcb, srca;
#endif
unsigned char *src, *srcrow, *dstrow;
uint32_t *dst;
/* TODO: Other function to clamp copy area? */
#if CPU_SSE2_SUPPORT
/* Row width rounded down to a multiple of 4: the part handled four pixels at a time */
row4size = srcimage->format.width & ~3;
vzero = _mm_setzero_si128();
v255 = _mm_set1_epi16( 255 );
valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif
src = srcimage->data;
dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
for( y = 0 ; y < srcimage->format.height ; y++ )
{
srcrow = src;
dstrow = (unsigned char *)dst;
#if CPU_SSE2_SUPPORT
for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
{
/* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
vsrc23 = _mm_loadu_si128( (void *)srcrow );
/* Skip the group when all four source alphas are zero: the destination stays untouched */
if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
continue;
vdst23 = _mm_loadu_si128( (void *)dstrow );
/* r0__g0__b0__a0__, r1__g1__b1__a1__ */
vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
/* r2__g2__b2__a2__, r3__g3__b3__a3__ */
vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
/* __a0__a0__a0__a0, __a1__a1__a1__a1 */
vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
/* __a2__a2__a2__a2, __a3__a3__a3__a3 */
vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
/* Same alpha broadcast without SSSE3: replicate word 3 of each 64-bit half */
vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
/* dst*(255-a) + src*a + 128 in every lane; the alpha lane result is overwritten when storing */
vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, vblend23 ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
/* Correction to divide by 255 instead of 256 */
vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
/* Combine interleaved and store */
/* OR-ing with valphatest forces byte 3 of every stored pixel to 0xff */
_mm_storeu_si128( (void *)dstrow, _mm_or_si128( _mm_packus_epi16( vdst01, vdst23 ), valphatest ) );
}
/* Remaining 0 to 3 pixels of the row, one pixel at a time with the same arithmetic */
for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
{
if( !( srcrow[3] ) )
continue;
/* A single 32-bit pixel is moved through the float load/store intrinsics purely as a bit pattern */
vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
/* Correction to divide by 255 instead of 256 */
vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
_mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_or_si128( _mm_packus_epi16( vdst01, vdst01 ), valphatest ) ) );
}
#else
/* Portable path: same blend computed with 32-bit integers, one pixel at a time */
for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
{
/* Fully transparent source pixel: leave the destination untouched */
if( !( srcrow[3] ) )
continue;
srcr = (int32_t)srcrow[0];
srcg = (int32_t)srcrow[1];
srcb = (int32_t)srcrow[2];
srca = (int32_t)srcrow[3];
dstr = (int32_t)dstrow[0];
dstg = (int32_t)dstrow[1];
dstb = (int32_t)dstrow[2];
/* ( dst << 8 ) - dst is dst*255, so each line is dst*(255-srca) + src*srca + 128 */
dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
/* Divide by 255 using shifts only */
dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
dstrow[0] = (unsigned char)dstr;
dstrow[1] = (unsigned char)dstg;
dstrow[2] = (unsigned char)dstb;
dstrow[3] = (unsigned char)255;
}
#endif
src = ADDRESS( src, srcimage->format.bytesperline );
dst = ADDRESS( dst, dstimage->format.bytesperline );
}
return;
}
/*
 * Alpha-blend srcimage (4 bytes per pixel, alpha in byte 3) onto a destination
 * that stores 3 bytes per pixel, with the source's top-left corner placed at
 * destination pixel (dstx,dsty).
 *
 * Each colour channel becomes ( dst*(255-srca) + src*srca ) / 255 with
 * rounding. Source pixels whose alpha is zero leave the destination untouched.
 * Nothing is clipped: the whole source image is processed.
 */
static void imgBlendImageRgba2Rgb( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int col, line, channel;
  int32_t alpha, blended;
  int32_t srcvalue[3], dstvalue[3];
  unsigned char *srcline, *dstline, *srcpixel, *dstpixel;

  /* TODO: Other function to clamp copy area? */
  srcline = (unsigned char *)srcimage->data;
  dstline = (unsigned char *)dstimage->data + ( ( dstx * 3 ) + ( dsty * dstimage->format.bytesperline ) );
  for( line = 0 ; line < srcimage->format.height ; line++ )
  {
    srcpixel = srcline;
    dstpixel = dstline;
    for( col = 0 ; col < srcimage->format.width ; col++, srcpixel += 4, dstpixel += 3 )
    {
      if( !( srcpixel[3] ) )
        continue;
      /* Fetch every input of the pixel before anything is written back */
      for( channel = 0 ; channel < 3 ; channel++ )
        srcvalue[channel] = (int32_t)srcpixel[channel];
      alpha = (int32_t)srcpixel[3];
      for( channel = 0 ; channel < 3 ; channel++ )
        dstvalue[channel] = (int32_t)dstpixel[channel];
      for( channel = 0 ; channel < 3 ; channel++ )
      {
        /* dst*255 + srca*(src-dst) + 128, then a shift-only division by 255 */
        blended = ( ( dstvalue[channel] << 8 ) - dstvalue[channel] + ( alpha * ( srcvalue[channel] - dstvalue[channel] ) ) + 128 );
        blended = ( blended + ( blended >> 8 ) ) >> 8;
        dstvalue[channel] = blended;
      }
      for( channel = 0 ; channel < 3 ; channel++ )
        dstpixel[channel] = (unsigned char)dstvalue[channel];
    }
    srcline += srcimage->format.bytesperline;
    dstline += dstimage->format.bytesperline;
  }
}
/*
 * Select the blend routine matching the pixel layouts of the two images.
 *
 * Only 4-byte-per-pixel sources are supported. A 4-byte destination of type
 * RGBA32 or BGRA32 gets the alpha-preserving routine, any other 4-byte
 * destination gets the routine that forces alpha to opaque, and a 3-byte
 * destination gets the RGB routine. Returns 0 when no routine applies.
 */
void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  if( srcimage->format.bytesperpixel != 4 )
    return 0;
  switch( dstimage->format.bytesperpixel )
  {
    case 4:
      if( ( dstimage->format.type == IMG_FORMAT_TYPE_RGBA32 ) || ( dstimage->format.type == IMG_FORMAT_TYPE_BGRA32 ) )
        return imgBlendImageRgba2Rgba;
      return imgBlendImageRgba2Rgbx;
    case 3:
      return imgBlendImageRgba2Rgb;
    default:
      break;
  }
  return 0;
}
/*
 * Blend srcimage onto dstimage at (dstx,dsty) using the routine chosen by
 * imgBlendGetFunction(). Returns 1 when a routine was found and run, 0 when
 * the combination of formats is not supported (nothing is modified then).
 */
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  void (*blend)( imgImage *, int, int, imgImage * ) = imgBlendGetFunction( dstimage, srcimage );

  if( !blend )
    return 0;
  blend( dstimage, dstx, dsty, srcimage );
  return 1;
}
////
/*
 * Make dstimage an independent copy of srcimage: the format is duplicated and
 * a new pixel buffer of height * bytesperline bytes is allocated and filled.
 *
 * On allocation failure dstimage->data is left as a null pointer (the format
 * is still copied), so callers can detect the failure by checking the pointer.
 * The buffer is released with imgFree().
 */
void imgAllocCopy( imgImage *dstimage, imgImage *srcimage )
{
  size_t datasize;

  dstimage->format = srcimage->format;
  /* Size computed in size_t so that large images do not overflow int arithmetic */
  datasize = (size_t)srcimage->format.height * (size_t)srcimage->format.bytesperline;
  dstimage->data = malloc( datasize );
  if( !dstimage->data )
    return;
  memcpy( dstimage->data, srcimage->data, datasize );
  return;
}
/*
 * Allocate dstimage as a copy of srcimage surrounded by a zero-filled border
 * of extendsize pixels on every side. Type and bytes per pixel are inherited
 * from the source; rows of the new image are tightly packed.
 *
 * The buffer is allocated zeroed, so only the source rows have to be copied
 * into place. On allocation failure dstimage->data is left as a null pointer
 * (the format fields are still filled in). Release the buffer with imgFree().
 */
void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize )
{
  int y;
  size_t rowbytes;
  unsigned char *dst, *src;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = srcimage->format.type;
  dstimage->format.bytesperpixel = srcimage->format.bytesperpixel;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  /* calloc() zeroes the border and checks the size multiplication for overflow */
  dstimage->data = calloc( (size_t)dstimage->format.height, (size_t)dstimage->format.bytesperline );
  if( !dstimage->data )
    return;
  rowbytes = (size_t)srcimage->format.width * (size_t)dstimage->format.bytesperpixel;
  src = (unsigned char *)srcimage->data;
  /* First interior pixel: skip extendsize rows, then extendsize pixels of left border */
  dst = (unsigned char *)dstimage->data;
  dst += (size_t)extendsize * (size_t)dstimage->format.bytesperline;
  dst += (size_t)extendsize * (size_t)dstimage->format.bytesperpixel;
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    memcpy( dst, src, rowbytes );
    src += srcimage->format.bytesperline;
    dst += dstimage->format.bytesperline;
  }
  return;
}
/*
 * Allocate dstimage as a 1-byte-per-pixel grayscale image of the same
 * dimensions as srcimage, holding byte number channelindex of every source
 * pixel.
 *
 * On allocation failure dstimage->data is left as a null pointer (the format
 * fields are still filled in). Release the buffer with imgFree().
 */
void imgAllocExtractChannel( imgImage *dstimage, imgImage *srcimage, int channelindex )
{
  int x, y;
  unsigned char *dst, *src, *srcpixel;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  /* Size computed in size_t so that large images do not overflow int arithmetic */
  dstimage->data = malloc( (size_t)dstimage->format.height * (size_t)dstimage->format.bytesperline );
  if( !dstimage->data )
    return;
  /* Start on the requested channel of the first pixel, then step a whole pixel at a time */
  src = (unsigned char *)srcimage->data + channelindex;
  dst = (unsigned char *)dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcpixel = src;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dst[x] = *srcpixel;
      srcpixel += srcimage->format.bytesperpixel;
    }
    src += srcimage->format.bytesperline;
    dst += dstimage->format.bytesperline;
  }
  return;
}
/*
 * Allocate dstimage as a 1-byte-per-pixel grayscale image holding byte number
 * channelindex of every source pixel, surrounded by a zero-filled border of
 * extendsize pixels on every side.
 *
 * The buffer is allocated zeroed, so only the interior has to be written.
 * On allocation failure dstimage->data is left as a null pointer (the format
 * fields are still filled in). Release the buffer with imgFree().
 */
void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize )
{
  int x, y;
  unsigned char *src, *dst, *srcpixel;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  /* calloc() zeroes the border and checks the size multiplication for overflow */
  dstimage->data = calloc( (size_t)dstimage->format.height, (size_t)dstimage->format.bytesperline );
  if( !dstimage->data )
    return;
  /* Start on the requested channel of the first source pixel */
  src = (unsigned char *)srcimage->data + channelindex;
  /* First interior pixel: skip extendsize rows, then extendsize pixels of left border */
  dst = (unsigned char *)dstimage->data;
  dst += (size_t)extendsize * (size_t)dstimage->format.bytesperline;
  dst += (size_t)extendsize * (size_t)dstimage->format.bytesperpixel;
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcpixel = src;
    for( x = 0 ; x < srcimage->format.width ; x++ )
    {
      dst[x] = *srcpixel;
      srcpixel += srcimage->format.bytesperpixel;
    }
    src += srcimage->format.bytesperline;
    dst += dstimage->format.bytesperline;
  }
  return;
}
/*
 * Allocate dstimage as an RGBA32 image of the same dimensions as srcimage in
 * which every pixel has the constant colour (r,g,b) and an alpha equal to byte
 * number channelindex of the corresponding source pixel.
 *
 * On allocation failure dstimage->data is left as a null pointer (the format
 * fields are still filled in). Release the buffer with imgFree().
 */
void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b )
{
  int x, y;
  unsigned char *dst, *src, *dstpixel, *srcpixel;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_RGBA32;
  dstimage->format.bytesperpixel = 4;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  /* Size computed in size_t so that large images do not overflow int arithmetic */
  dstimage->data = malloc( (size_t)dstimage->format.height * (size_t)dstimage->format.bytesperline );
  if( !dstimage->data )
    return;
  /* Start on the requested channel of the first source pixel */
  src = (unsigned char *)srcimage->data + channelindex;
  dst = (unsigned char *)dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcpixel = src;
    dstpixel = dst;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dstpixel[0] = r;
      dstpixel[1] = g;
      dstpixel[2] = b;
      dstpixel[3] = *srcpixel;
      srcpixel += srcimage->format.bytesperpixel;
      dstpixel += dstimage->format.bytesperpixel;
    }
    src += srcimage->format.bytesperline;
    dst += dstimage->format.bytesperline;
  }
  return;
}
/*
 * Allocate dstimage with the format of srcimage and fill it with a
 * brightness/contrast adjusted version of the source.
 *
 * Every adjusted channel is mapped to [0,1], transformed as
 * ( ( v - 0.5 ) * contrast ) + 0.5 + brightness, scaled back to 0..255,
 * rounded and clamped. Images with 3 or more bytes per pixel have their first
 * three channels adjusted (a fourth byte, if any, is copied unchanged);
 * images with 1 byte per pixel have that single channel adjusted.
 *
 * Any other pixel size is not adjusted: the source data is copied verbatim so
 * that the returned buffer never contains uninitialised memory.
 *
 * On allocation failure dstimage->data is left as a null pointer (the format
 * is still copied). Release the buffer with imgFree().
 */
void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast )
{
  int x, y, channel, channelcount;
  size_t datasize;
  float value;
  unsigned char *dst, *src, *dstpixel, *srcpixel;

  dstimage->format = srcimage->format;
  /* Size computed in size_t so that large images do not overflow int arithmetic */
  datasize = (size_t)srcimage->format.height * (size_t)srcimage->format.bytesperline;
  dstimage->data = malloc( datasize );
  if( !dstimage->data )
    return;
  if( dstimage->format.bytesperpixel >= 3 )
    channelcount = 3;
  else if( dstimage->format.bytesperpixel == 1 )
    channelcount = 1;
  else
  {
    /* Unsupported pixel size: hand back an unmodified copy rather than garbage */
    memcpy( dstimage->data, srcimage->data, datasize );
    return;
  }
  /* Fold the 0.5 midpoint into the brightness term once, outside the loops */
  brightness += 0.5f;
  src = (unsigned char *)srcimage->data;
  dst = (unsigned char *)dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcpixel = src;
    dstpixel = dst;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      for( channel = 0 ; channel < channelcount ; channel++ )
      {
        value = (1.0f/255.0f) * (float)srcpixel[channel];
        value = ( ( value - 0.5f ) * contrast ) + brightness;
        dstpixel[channel] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( value * 255.0f ) ) );
      }
      /* The fourth byte (alpha) is passed through untouched */
      if( dstimage->format.bytesperpixel >= 4 )
        dstpixel[3] = srcpixel[3];
      srcpixel += srcimage->format.bytesperpixel;
      dstpixel += dstimage->format.bytesperpixel;
    }
    src += srcimage->format.bytesperline;
    dst += dstimage->format.bytesperline;
  }
  return;
}
/*
 * Release the pixel buffer of an image and reset its data pointer to null.
 * The imgImage structure itself is not freed.
 */
void imgFree( imgImage *image )
{
  void *pixels = image->data;

  image->data = 0;
  free( pixels );
}
////

View File

@ -0,0 +1,74 @@
/* *****************************************************************************
*
* Copyright (c) 2007-2016 Alexis Naveros.
* Portions developed under contract to the SURVICE Engineering Company.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this file; see the file named COPYING for more
* information.
*
* *****************************************************************************
*/
#ifndef IMG_H
#define IMG_H
/* Describes the layout of an image's pixel buffer */
typedef struct
{
/* Image width, in pixels */
int width;
/* Image height, in rows */
int height;
/* Pixel format, one of the IMG_FORMAT_TYPE_* values */
int type;
/* Size of one pixel, in bytes */
int bytesperpixel;
/* Distance in bytes between the start of one row and the start of the next */
int bytesperline;
} imgFormat;
/* Values for imgFormat.type */
/* NOTE(review): the names presumably give the channel order and total bits per pixel -- confirm against the loaders that fill them in */
enum
{
IMG_FORMAT_TYPE_ANY,
IMG_FORMAT_TYPE_RGB24,
IMG_FORMAT_TYPE_BGR24,
IMG_FORMAT_TYPE_RGBX32,
IMG_FORMAT_TYPE_BGRX32,
IMG_FORMAT_TYPE_RGBA32,
IMG_FORMAT_TYPE_BGRA32,
IMG_FORMAT_TYPE_GRAYSCALE,
IMG_FORMAT_TYPE_GRAYALPHA
};
/* An image: a format description plus a pointer to its pixel buffer */
typedef struct
{
/* Dimensions and pixel layout of the buffer */
imgFormat format;
/* Pixel buffer; the imgAlloc*() functions obtain it from malloc() and imgFree() releases it */
void *data;
} imgImage;
////
/* Copy a sizex by sizey pixel rectangle from (srcx,srcy) to (dstx,dsty) within one image; no clipping is done */
void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey );
/* Return the blend routine suited to the two image formats, or a null pointer if the combination is unsupported */
void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
/* Alpha-blend srcimage onto dstimage at (dstx,dsty); returns 1 on success, 0 if the formats are unsupported */
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
/* The imgAlloc*() functions below allocate dst->data; release it with imgFree() */
/* Duplicate the format and pixel data of src */
void imgAllocCopy( imgImage *dst, imgImage *src );
/* Copy srcimage surrounded by a zero-filled border of extendsize pixels on every side */
void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize );
/* Build a 1-byte-per-pixel grayscale image from byte channelindex of every source pixel */
void imgAllocExtractChannel( imgImage *dst, imgImage *src, int channelindex );
/* Same as imgAllocExtractChannel(), with a zero-filled border of extendsize pixels on every side */
void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize );
/* Build an RGBA32 image of constant colour (r,g,b) whose alpha is byte channelindex of every source pixel */
void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b );
/* Build a copy of srcimage with brightness and contrast applied to its colour channels */
void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast );
/* Free the pixel buffer of an image and reset its data pointer */
void imgFree( imgImage *image );
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,150 @@
/* -----------------------------------------------------------------------------
*
* Copyright (c) 2014-2017 Alexis Naveros.
* Portions developed under contract to the SURVICE Engineering Company.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* -----------------------------------------------------------------------------
*/
#ifndef IMGRESIZE_H
#define IMGRESIZE_H
/* Options controlling the image reduction functions declared in this header */
typedef struct
{
/* Specify filter type, from the IM_REDUCE_FILTER_* list */
int filter;
/* High quality, a little slow: hopcount=3; */
/* Good quality, much faster: hopcount=2; */
int hopcount;
/* Strong preservation/amplification of details: alpha=2.0f; */
/* Mild preservation/amplification of details: alpha=6.0f; */
float alpha;
/* NORMALMAP filters: factor to amplify normals on X and Y before normalization */
float amplifynormal;
/* NORMALMAP_SUSTAIN filters: Preserve a factor of deviation "energy" as calculated by sqrtf(x*x+y*y) */
float normalsustainfactor;
} imReduceOptions;
/* Fill every field of *options from the given arguments in one call */
static inline void imReduceSetOptions( imReduceOptions *options, int filter, int hopcount, float alpha, float amplifynormal, float normalsustainfactor )
{
  imReduceOptions values;

  values.filter = filter;
  values.hopcount = hopcount;
  values.alpha = alpha;
  values.amplifynormal = amplifynormal;
  values.normalsustainfactor = normalsustainfactor;
  /* The structure has exactly these five fields, so one assignment sets them all */
  *options = values;
}
/* Reduce the image's dimensions by an integer divisor ~ this is fairly fast */
int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options );
/* Same as imReduceImageKaiserDataDivisor(), but imgdst is allocated */
int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options );
/* Reduce the image's dimensions to match the newwidth and newheight ~ this is a little slower */
int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options );
/* Same as imReduceImageKaiserData(), but imgdst is allocated */
int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options );
/* Resize by half with a dumb box filter ~ don't use that except for the smallest mipmaps */
/* Filters with ALPHANORM and/or SUSTAIN keywords are processed as the regular base filter only */
int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options );
int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options );
/*
Keywords for image reduction filters
LINEAR: Data is linear, note that this is *not* the format of typical diffuse textures
SRGB: Color is in sRGB space, any alpha is presumed linear
NORMALMAP: RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear
ALPHANORM: Alpha normalization, the weight of pixels is proportional to their alpha values
(do you have "black" fully transparent pixels? please use an ALPHANORM filter)
SUSTAIN: The "energy" of the normal map is sustained, amplified to preserve the level of details
Note that this filter is rather slow (set options->normalsustainfactor to 0.75 or so)
*/
/* Values for imReduceOptions.filter; the keywords used in the names are explained in the comment above */
enum
{
/* Linear space */
IM_REDUCE_FILTER_LINEAR,
IM_REDUCE_FILTER_LINEAR_ALPHANORM,
/* sRGB space (probably what you want for diffuse textures) */
IM_REDUCE_FILTER_SRGB,
IM_REDUCE_FILTER_SRGB_ALPHANORM,
/* RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear */
IM_REDUCE_FILTER_NORMALMAP,
IM_REDUCE_FILTER_NORMALMAP_ALPHANORM,
IM_REDUCE_FILTER_NORMALMAP_SUSTAIN,
IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM,
/* Custom specialized filters */
IM_REDUCE_FILTER_WATERMAP,
IM_REDUCE_FILTER_PLANTMAP,
IM_REDUCE_FILTER_FOLLIAGE,
IM_REDUCE_FILTER_SKY,
IM_REDUCE_FILTER_FOG
};
////
#define IM_MIPMAP_CASCADE_MAX (16)
/* State of a chain of mipmap layers, used with imBuildMipmapCascade() and imFreeMipmapCascade() */
/* NOTE(review): the field meanings below are inferred from their names and from the parameters of */
/* imBuildMipmapCascade(); the implementation is not visible here -- confirm */
typedef struct
{
/* Presumably the dimensions of the base image, in pixels */
int width;
int height;
/* Presumably the number of entries of mipmap[] in use; the array holds at most IM_MIPMAP_CASCADE_MAX */
int layercount;
/* Presumably the pixel size and row stride of the base image, in bytes */
int bytesperpixel;
int bytesperline;
/* Reduction options associated with the cascade */
imReduceOptions *options;
/* One pointer per layer */
void *mipmap[IM_MIPMAP_CASCADE_MAX];
} imMipmapCascade;
int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags );
void imFreeMipmapCascade( imMipmapCascade *cascade );
/* For base texture, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_BASE (0x1)
/* For generated mipmaps, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS (0x2)
////
void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline );
////
#endif

View File

@ -934,6 +934,11 @@ namespace UserConfigParams
PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked", PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked",
"Enable all karts and tracks") ); "Enable all karts and tracks") );
PARAM_PREFIX BoolUserConfigParam m_hq_mipmap
PARAM_DEFAULT( BoolUserConfigParam(false, "hq_mipmap",
"Generate mipmap for textures using "
"high quality method with SSE") );
// TODO? implement blacklist for new irrlicht device and GUI // TODO? implement blacklist for new irrlicht device and GUI
PARAM_PREFIX std::vector<std::string> m_blacklist_res; PARAM_PREFIX std::vector<std::string> m_blacklist_res;

View File

@ -490,7 +490,7 @@ bool CentralVideoSettings::isARBPixelBufferObjectUsable() const
bool CentralVideoSettings::supportsThreadedTextureLoading() const bool CentralVideoSettings::supportsThreadedTextureLoading() const
{ {
return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable(); return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable() && isARBTextureStorageUsable();
} }
#endif // !SERVER_ONLY #endif // !SERVER_ONLY

View File

@ -0,0 +1,119 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include "graphics/hq_mipmap_generator.hpp"
#define DUMP_MIPMAP
#ifdef DUMP_MIPMAP
#include "graphics/irr_driver.hpp"
#include "utils/string_utils.hpp"
#endif
#include <cassert>
extern "C"
{
#include <mipmap/img.h>
#include <mipmap/imgresize.h>
}
// ----------------------------------------------------------------------------
/** Prepares the generation of every mipmap level below the base image.
 *  The size of each level and its byte offset inside the packed output
 *  buffer are computed here, so that getTextureSize() is already valid before
 *  any pixel is processed.
 *  \param name Name handed to the video::ITexture base class.
 *  \param data Pixels of the base level (1 byte per pixel if single_channel,
 *         otherwise 4). Only the pointer is stored here.
 *  \param size Dimension of the base level.
 *  \param texture_name OpenGL texture which receives the generated levels.
 *  \param single_channel True for a one byte per pixel texture.
 */
HQMipmapGenerator::HQMipmapGenerator(const io::path& name, uint8_t* data,
                                     const core::dimension2d<u32>& size,
                                     GLuint texture_name, bool single_channel)
                 : video::ITexture(name), m_orig_data(data), m_size(size),
                   m_texture_name(texture_name), m_texture_size(0),
                   m_single_channel(single_channel), m_mipmap_data(NULL)
{
    const unsigned bytes_per_pixel = m_single_channel ? 1 : 4;
    unsigned width = m_size.Width;
    unsigned height = m_size.Height;
    // Halve both sides (never going below 1) until the 1x1 level is reached.
    // Each entry stores the level dimension and the offset at which the
    // level starts in the packed buffer; m_texture_size ends up as the sum
    // of all level sizes, so no further adjustment is needed after the loop.
    do
    {
        width = width < 2 ? 1 : width >> 1;
        height = height < 2 ? 1 : height >> 1;
        m_mipmap_sizes.emplace_back(core::dimension2du(width, height),
            m_texture_size);
        m_texture_size += width * height * bytes_per_pixel;
    } while (width != 1 || height != 1);

    // Zero-initialised on purpose: the destructor hands this structure to
    // imFreeMipmapCascade() even if threadedReload() never filled it, so it
    // must not contain indeterminate pointers.
    m_mipmap_data = calloc(1, sizeof(imMipmapCascade));
}   // HQMipmapGenerator
// ----------------------------------------------------------------------------
/** Releases the mipmap cascade owned by this generator. Nothing is done when
 *  the cascade structure could not be allocated in the constructor.
 */
HQMipmapGenerator::~HQMipmapGenerator()
{
    // The allocation in the constructor is not checked, so guard against
    // passing a null cascade to the C library.
    if (m_mipmap_data == NULL)
        return;
    imFreeMipmapCascade(static_cast<imMipmapCascade*>(m_mipmap_data));
    free(m_mipmap_data);
}   // ~HQMipmapGenerator
// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedReload(void* ptr, void* param) const
{
imReduceOptions options;
imReduceSetOptions(&options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f);
imMipmapCascade* mm_cascade = (imMipmapCascade*)m_mipmap_data;
#ifdef DEBUG
int ret = imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
assert(ret == 1);
#else
imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
#endif
for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
{
memcpy((uint8_t*)ptr + m_mipmap_sizes[i].second,
mm_cascade->mipmap[i + 1],
m_mipmap_sizes[i].first.getArea() * (m_single_channel ? 1 : 4));
#ifdef DUMP_MIPMAP
if (m_single_channel) continue;
video::IImage* image = irr_driver->getVideoDriver()
->createImageFromData(video::ECF_A8R8G8B8, m_mipmap_sizes[i].first,
mm_cascade->mipmap[i + 1], false/*ownForeignMemory*/);
irr_driver->getVideoDriver()->writeImageToFile(image, std::string
(StringUtils::toString(i) + "_" +
StringUtils::getBasename(NamedPath.getPtr()) + ".png").c_str());
image->drop();
#endif
}
} // threadedReload
// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedSubImage(void* ptr) const
{
#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
glBindTexture(GL_TEXTURE_2D, m_texture_name);
for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
{
glTexSubImage2D(GL_TEXTURE_2D, i + 1, 0, 0,
m_mipmap_sizes[i].first.Width, m_mipmap_sizes[i].first.Height,
m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE,
(uint8_t*)ptr + m_mipmap_sizes[i].second);
}
delete this;
#endif
} // threadedSubImage
// ----------------------------------------------------------------------------
/** Frees the base level pixel data which was handed to the constructor.
 *  The pointer is reset afterwards, so that a repeated call is harmless and
 *  no dangling pointer is kept.
 */
void HQMipmapGenerator::cleanThreadedLoader()
{
    delete[] m_orig_data;
    m_orig_data = NULL;
}   // cleanThreadedLoader

View File

@ -0,0 +1,98 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#ifndef HEADER_HQ_MIPMAP_GENERATOR_HPP
#define HEADER_HQ_MIPMAP_GENERATOR_HPP
#include "graphics/gl_headers.hpp"
#include "utils/no_copy.hpp"
#include "utils/types.hpp"
#include <vector>
#include <ITexture.h>
using namespace irr;
/** A helper texture object which generates the mipmap levels of an existing
 *  OpenGL texture from its base level pixels. It implements the
 *  video::ITexture interface mostly with stubs; the actual work is done in
 *  threadedReload() (building the levels) and threadedSubImage() (uploading
 *  them, after which the object deletes itself).
 */
class HQMipmapGenerator : public video::ITexture, NoCopy
{
private:
/** Pixels of the base level, released with delete[] in
 *  cleanThreadedLoader(). */
uint8_t* m_orig_data;
/** Dimension of the base level. */
core::dimension2d<u32> m_size;
/** OpenGL name of the texture which receives the generated levels. */
GLuint m_texture_name;
/** Total size in bytes of all generated levels. */
unsigned int m_texture_size;
/** True if the texture uses 1 byte per pixel, otherwise 4 are used. */
bool m_single_channel;
/** Opaque pointer to the mipmap cascade structure of the C library,
 *  allocated in the constructor and released in the destructor. */
void* m_mipmap_data;
/** For each generated level (starting with level 1): its dimension and its
 *  byte offset inside the packed buffer. */
std::vector<std::pair<core::dimension2d<u32>, size_t> > m_mipmap_sizes;
public:
// ------------------------------------------------------------------------
/** Computes the size and offset of every level, see the members above. */
HQMipmapGenerator(const io::path& name, uint8_t* data,
const core::dimension2d<u32>& size, GLuint texture_name,
bool single_channel);
// ------------------------------------------------------------------------
virtual ~HQMipmapGenerator();
// ------------------------------------------------------------------------
/** Locking is not supported, NULL is always returned. */
virtual void* lock(video::E_TEXTURE_LOCK_MODE mode =
video::ETLM_READ_WRITE, u32 mipmap_level = 0)
{ return NULL; }
// ------------------------------------------------------------------------
/** Does nothing, see lock(). */
virtual void unlock() {}
// ------------------------------------------------------------------------
/** Returns the dimension of the base level. */
virtual const core::dimension2d<u32>& getOriginalSize() const
{ return m_size; }
// ------------------------------------------------------------------------
/** Returns the dimension of the base level. */
virtual const core::dimension2d<u32>& getSize() const { return m_size; }
// ------------------------------------------------------------------------
/** Returns the driver type selected at compile time by USE_GLES2. */
virtual video::E_DRIVER_TYPE getDriverType() const
{
#if defined(USE_GLES2)
return video::EDT_OGLES2;
#else
return video::EDT_OPENGL;
#endif
}
// ------------------------------------------------------------------------
/** Always reports ECF_A8R8G8B8, also for single channel textures. */
virtual video::ECOLOR_FORMAT getColorFormat() const
{ return video::ECF_A8R8G8B8; }
// ------------------------------------------------------------------------
/** Stub, always returns 0. */
virtual u32 getPitch() const { return 0; }
// ------------------------------------------------------------------------
/** Stub, always returns false. */
virtual bool hasMipMaps() const { return false; }
// ------------------------------------------------------------------------
/** Stub, does nothing. */
virtual void regenerateMipMapLevels(void* mipmap_data = NULL) {}
// ------------------------------------------------------------------------
/** Returns the OpenGL name of the texture receiving the mipmap levels. */
virtual u32 getOpenGLTextureName() const { return m_texture_name; }
// ------------------------------------------------------------------------
/** Stub, always returns 0. */
virtual u64 getHandle() { return 0; }
// ------------------------------------------------------------------------
/** Returns the number of bytes needed to store all generated levels. */
virtual unsigned int getTextureSize() const { return m_texture_size; }
// ------------------------------------------------------------------------
/** Builds the levels and copies them into the buffer at ptr. */
virtual void threadedReload(void* ptr, void* param) const;
// ------------------------------------------------------------------------
/** Uploads the levels found at ptr, then deletes this object. */
virtual void threadedSubImage(void* ptr) const;
// ------------------------------------------------------------------------
/** Frees the base level pixel data. */
virtual void cleanThreadedLoader();
}; // HQMipmapGenerator
#endif

View File

@ -17,6 +17,7 @@
#include "graphics/stk_tex_manager.hpp" #include "graphics/stk_tex_manager.hpp"
#include "config/hardware_stats.hpp" #include "config/hardware_stats.hpp"
#include "config/user_config.hpp"
#include "graphics/central_settings.hpp" #include "graphics/central_settings.hpp"
#include "graphics/materials.hpp" #include "graphics/materials.hpp"
#include "graphics/threaded_tex_loader.hpp" #include "graphics/threaded_tex_loader.hpp"
@ -33,11 +34,12 @@ STKTexManager::STKTexManager() : m_pbo(0), m_thread_size(0)
#if !(defined(SERVER_ONLY) || defined(USE_GLES2)) #if !(defined(SERVER_ONLY) || defined(USE_GLES2))
if (CVS->supportsThreadedTextureLoading()) if (CVS->supportsThreadedTextureLoading())
{ {
UserConfigParams::m_hq_mipmap = true;
pthread_mutex_init(&m_threaded_load_textures_mutex, NULL); pthread_mutex_init(&m_threaded_load_textures_mutex, NULL);
pthread_cond_init(&m_cond_request, NULL); pthread_cond_init(&m_cond_request, NULL);
m_thread_size = HardwareStats::getNumProcessors(); m_thread_size = HardwareStats::getNumProcessors();
m_thread_size = core::clamp(m_thread_size, 1, 3); m_thread_size = core::clamp(m_thread_size, 1, 8);
static const unsigned max_pbo_size = 48 * 1024 * 1024; static const unsigned max_pbo_size = 128 * 1024 * 1024;
const unsigned each_capacity = max_pbo_size / m_thread_size; const unsigned each_capacity = max_pbo_size / m_thread_size;
Log::info("STKTexManager", "%d thread(s) for texture loading," Log::info("STKTexManager", "%d thread(s) for texture loading,"
" each capacity %d MB", m_thread_size, " each capacity %d MB", m_thread_size,

View File

@ -18,14 +18,17 @@
#include "graphics/stk_texture.hpp" #include "graphics/stk_texture.hpp"
#include "config/user_config.hpp" #include "config/user_config.hpp"
#include "graphics/central_settings.hpp" #include "graphics/central_settings.hpp"
#include "graphics/hq_mipmap_generator.hpp"
#include "graphics/irr_driver.hpp" #include "graphics/irr_driver.hpp"
#include "graphics/material.hpp" #include "graphics/material.hpp"
#include "graphics/material_manager.hpp" #include "graphics/material_manager.hpp"
#include "graphics/materials.hpp" #include "graphics/materials.hpp"
#include "graphics/stk_tex_manager.hpp"
#include "modes/profile_world.hpp" #include "modes/profile_world.hpp"
#include "utils/log.hpp" #include "utils/log.hpp"
#include "utils/string_utils.hpp" #include "utils/string_utils.hpp"
#include <algorithm>
#include <fstream> #include <fstream>
#include <functional> #include <functional>
@ -215,7 +218,7 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
const unsigned int w = m_size.Width; const unsigned int w = m_size.Width;
const unsigned int h = m_size.Height; const unsigned int h = m_size.Height;
unsigned int format = m_single_channel ? GL_RED : GL_BGRA; unsigned int format = m_single_channel ? GL_RED : GL_BGRA;
unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA; unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA8;
#if !defined(USE_GLES2) #if !defined(USE_GLES2)
if (m_mesh_texture && CVS->isTextureCompressionEnabled()) if (m_mesh_texture && CVS->isTextureCompressionEnabled())
@ -227,13 +230,41 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
else else
{ {
internal_format = internal_format =
m_single_channel ? GL_R8 : m_srgb ? GL_SRGB_ALPHA : GL_RGBA; m_single_channel ? GL_R8 : m_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8;
} }
#endif #endif
if (!useThreadedLoading()) if (!useThreadedLoading())
formatConversion(data, &format, w, h); formatConversion(data, &format, w, h);
if (!no_upload) if (useThreadedLoading())
{
if (m_texture_name == 0)
{
glGenTextures(1, &m_texture_name);
glBindTexture(GL_TEXTURE_2D, m_texture_name);
if (m_single_channel)
{
glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ONE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_ONE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_ONE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_RED);
}
int levels = 1;
int width = w;
int height = h;
while (true)
{
width = width < 2 ? 1 : width >> 1;
height = height < 2 ? 1 : height >> 1;
levels++;
if (width == 1 && height == 1)
break;
}
glTexStorage2D(GL_TEXTURE_2D, levels, internal_format, w, h);
}
}
else if (!no_upload)
{ {
const bool reload = m_texture_name != 0; const bool reload = m_texture_name != 0;
if (!reload) if (!reload)
@ -253,14 +284,14 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format, glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format,
GL_UNSIGNED_BYTE, data); GL_UNSIGNED_BYTE, data);
} }
else if (!useThreadedLoading()) else
{ {
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format, glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format,
GL_UNSIGNED_BYTE, data); GL_UNSIGNED_BYTE, data);
} }
if (orig_img) if (orig_img)
orig_img->unlock(); orig_img->unlock();
if (hasMipMaps() && !useThreadedLoading()) if (hasMipMaps())
glGenerateMipmap(GL_TEXTURE_2D); glGenerateMipmap(GL_TEXTURE_2D);
} }
@ -594,8 +625,15 @@ void STKTexture::threadedReload(void* ptr, void* param) const
if (orig_img) if (orig_img)
{ {
orig_img->unlock(); orig_img->unlock();
orig_img->setDeleteMemory(false);
orig_img->drop(); orig_img->drop();
} }
if (useHQMipmap())
{
HQMipmapGenerator* hqmg = new HQMipmapGenerator(NamedPath, data,
m_size, m_texture_name, m_single_channel);
((STKTexManager*)(param))->addThreadedLoadTexture(hqmg);
}
else else
delete[] data; delete[] data;
} // threadedReload } // threadedReload
@ -607,8 +645,11 @@ void STKTexture::threadedSubImage(void* ptr) const
glBindTexture(GL_TEXTURE_2D, m_texture_name); glBindTexture(GL_TEXTURE_2D, m_texture_name);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height, glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height,
m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr); m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr);
if (useHQMipmap())
return;
if (hasMipMaps()) if (hasMipMaps())
glGenerateMipmap(GL_TEXTURE_2D); glGenerateMipmap(GL_TEXTURE_2D);
#endif #endif
} // threadedSubImage } // threadedSubImage
@ -620,3 +661,10 @@ void STKTexture::cleanThreadedLoader()
m_file = NULL; m_file = NULL;
m_img_loader = NULL; m_img_loader = NULL;
} // cleanThreadedLoader } // cleanThreadedLoader
//-----------------------------------------------------------------------------
/** Returns true if the high quality mipmap generator should be used for
 *  this texture: the hq_mipmap user setting must be enabled and both sides
 *  of the texture must be larger than one pixel.
 */
bool STKTexture::useHQMipmap() const
{
    if (!UserConfigParams::m_hq_mipmap)
        return false;

    const bool wider_than_one_pixel = m_size.Width > 1;
    const bool taller_than_one_pixel = m_size.Height > 1;
    return wider_than_one_pixel && taller_than_one_pixel;
}   // useHQMipmap

View File

@ -76,6 +76,8 @@ private:
sc[i] = data[4 * i + 3]; sc[i] = data[4 * i + 3];
return sc; return sc;
} }
// ------------------------------------------------------------------------
bool useHQMipmap() const;
public: public:
// ------------------------------------------------------------------------ // ------------------------------------------------------------------------

View File

@ -84,8 +84,9 @@ void ThreadedTexLoader::handleCompletedTextures()
size_t offset = m_pbo_offset; size_t offset = m_pbo_offset;
for (irr::video::ITexture* tex : m_completed_textures) for (irr::video::ITexture* tex : m_completed_textures)
{ {
size_t cur_offset = tex->getTextureSize();
tex->threadedSubImage((void*)offset); tex->threadedSubImage((void*)offset);
offset += tex->getTextureSize(); offset += cur_offset;
} }
m_completed_textures.clear(); m_completed_textures.clear();
#endif #endif