Applied Stragus' SSE optimisations for spherical harmonics.
This commit is contained in:
parent
12e6bab90a
commit
a500f3becc
@ -29,29 +29,107 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <irrlicht.h>
|
#include <irrlicht.h>
|
||||||
|
|
||||||
|
#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#define SIMD_SSE2_SUPPORT (1)
|
||||||
|
#endif
|
||||||
|
#if __SSE4_1__ || __AVX__
|
||||||
|
#include <smmintrin.h>
|
||||||
|
#define SIMD_SSE4_1_SUPPORT (1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if SIMD_SSE4_1_SUPPORT
|
||||||
|
#include <smmintrin.h>
|
||||||
|
#define SIMD_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask)
|
||||||
|
#elif SIMD_SSE2_SUPPORT
|
||||||
|
#define SIMD_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
|
||||||
|
#define ALWAYSINLINE __attribute__((always_inline))
|
||||||
|
#define SIMD_ALIGN16 __attribute__((aligned(16)))
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define SIMD_ALIGN16 __declspec(align(16))
|
||||||
|
#define ALWAYSINLINE __forceinline
|
||||||
|
#else
|
||||||
|
#define ALWAYSINLINE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
using namespace irr;
|
using namespace irr;
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
/** Convert an unsigned char cubemap texture to a float texture
|
|
||||||
* \param sh_rgba The 6 faces of the cubemap texture
|
#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__i386__) || defined(__i386) || defined(i386)
|
||||||
* \param sh_w Texture width
|
|
||||||
* \param sh_h Texture height
|
/* Input is 0.0,255.0, output is 0.0,1.0 */
|
||||||
* \param[out] float_tex_cube The converted float cubemap texture
|
static inline ALWAYSINLINE float srgb2linear( float v )
|
||||||
*/
|
|
||||||
void convertToFloatTexture(unsigned char *sh_rgba[6], unsigned sh_w, unsigned sh_h, Color *float_tex_cube[6])
|
|
||||||
{
|
{
|
||||||
for (unsigned i = 0; i < 6; i++)
|
float v2, vpow, vpwsqrt;
|
||||||
|
union
|
||||||
{
|
{
|
||||||
float_tex_cube[i] = new Color[sh_w * sh_h];
|
int32_t i;
|
||||||
for (unsigned j = 0; j < sh_w * sh_h; j++)
|
float f;
|
||||||
{
|
} u;
|
||||||
float_tex_cube[i][j].Blue = powf(float(0xFF & sh_rgba[i][4 * j]) / 255.f, 2.2f);
|
if( v <= (0.04045f*255.0f) )
|
||||||
float_tex_cube[i][j].Green = powf(float(0xFF & sh_rgba[i][4 * j + 1]) / 255.f, 2.2f);
|
v = v * ( (1.0f/12.92f)*(1.0f/255.0f) );
|
||||||
float_tex_cube[i][j].Red = powf(float(0xFF & sh_rgba[i][4 * j + 2]) / 255.f, 2.2f);
|
else
|
||||||
}
|
{
|
||||||
}
|
v = ( v + (0.055f*255.0f) ) * ( (1.0f/1.055f)*(1.0f/255.0f) );
|
||||||
} //convertToFloatTexture
|
v2 = v * v;
|
||||||
|
u.f = v * 5417434112.0f;
|
||||||
|
u.i = (int32_t)( ( (float)u.i * 0.8f ) + 0.5f );
|
||||||
|
vpow = u.f;
|
||||||
|
vpwsqrt = sqrtf( vpow );
|
||||||
|
v = ( ( v2 * vpwsqrt ) + ( ( ( v2 * v ) / vpwsqrt ) / sqrtf( vpwsqrt ) ) ) * 0.51011878327f;
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/* Input is 0.0,255.0, output is 0.0,1.0 */
|
||||||
|
/* Only for reference, this is waayyy too slow and should never be used */
|
||||||
|
static inline ALWAYSINLINE float srgb2linear( float v )
|
||||||
|
{
|
||||||
|
v *= (1.0f/255.0f);
|
||||||
|
if( v <= 0.04045f )
|
||||||
|
v = v * (1.0f/12.92);
|
||||||
|
else
|
||||||
|
v = pow( ( v + 0.055f ) * (1.0f/1.055f), 2.4f );
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if SIMD_SSE2_SUPPORT
|
||||||
|
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst00[4] = { 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst01[4] = { (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f) };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst02[4] = { 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst03[4] = { (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f) };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst04[4] = { 5417434112.0f, 5417434112.0f, 5417434112.0f, 5417434112.0f };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst05[4] = { 0.8f, 0.8f, 0.8f, 0.8f };
|
||||||
|
static const float SIMD_ALIGN16 srgbLinearConst06[4] = { 0.51011878327f, 0.51011878327f, 0.51011878327f, 0.51011878327f };
|
||||||
|
|
||||||
|
/* Input is 0.0,255.0 ~ output is 0.0,1.0 */
|
||||||
|
static inline ALWAYSINLINE __m128 srgb2linear4( __m128 vx )
|
||||||
|
{
|
||||||
|
__m128 vmask, vbase;
|
||||||
|
__m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
|
||||||
|
vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst00 );
|
||||||
|
vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst01 );
|
||||||
|
vx = _mm_mul_ps( _mm_add_ps( vx, *(__m128*)srgbLinearConst02 ), *(__m128*)srgbLinearConst03 );
|
||||||
|
vx2 = _mm_mul_ps( vx, vx );
|
||||||
|
vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst04 ) ) ), *(__m128*)srgbLinearConst05 ) ) );
|
||||||
|
vpwsqrtinv = _mm_rsqrt_ps( vpow );
|
||||||
|
vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
|
||||||
|
vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), *(__m128*)srgbLinearConst06 );
|
||||||
|
return SIMD_BLENDV_PS( vx, vbase, vmask );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// ------------------------------------------------------------------------
|
// ------------------------------------------------------------------------
|
||||||
/** Print the nine first spherical harmonics coefficients
|
/** Print the nine first spherical harmonics coefficients
|
||||||
@ -106,174 +184,292 @@ namespace
|
|||||||
*/
|
*/
|
||||||
void getXYZ(GLenum face, float i, float j, float &x, float &y, float &z)
|
void getXYZ(GLenum face, float i, float j, float &x, float &y, float &z)
|
||||||
{
|
{
|
||||||
|
float norminv;
|
||||||
switch (face)
|
switch (face)
|
||||||
{
|
{
|
||||||
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
|
case 0: // PosX
|
||||||
x = 1.;
|
x = 1.0f;
|
||||||
y = -i;
|
y = -i;
|
||||||
z = -j;
|
z = -j;
|
||||||
break;
|
break;
|
||||||
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
|
case 1: // NegX
|
||||||
x = -1.;
|
x = -1.0f;
|
||||||
y = -i;
|
y = -i;
|
||||||
z = j;
|
z = j;
|
||||||
break;
|
break;
|
||||||
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
|
case 2: // PosY
|
||||||
x = j;
|
x = j;
|
||||||
y = 1.;
|
y = 1.0f;
|
||||||
z = i;
|
z = i;
|
||||||
break;
|
break;
|
||||||
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
|
case 3: // NegY
|
||||||
x = j;
|
x = j;
|
||||||
y = -1;
|
y = -1.0f;
|
||||||
z = -i;
|
z = -i;
|
||||||
break;
|
break;
|
||||||
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
|
case 4: // PosZ
|
||||||
x = j;
|
x = j;
|
||||||
y = -i;
|
y = -i;
|
||||||
z = 1;
|
z = 1.0f;
|
||||||
break;
|
break;
|
||||||
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
|
case 5: // NegZ
|
||||||
x = -j;
|
x = -j;
|
||||||
y = -i;
|
y = -i;
|
||||||
z = -1;
|
z = -1.0f;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
float norm = sqrt(x * x + y * y + z * z);
|
norminv = 1.0f / sqrtf(x * x + y * y + z * z);
|
||||||
x /= norm, y /= norm, z /= norm;
|
x *= norminv, y *= norminv, z *= norminv;
|
||||||
return;
|
return;
|
||||||
} // getXYZ
|
} // getXYZ
|
||||||
|
|
||||||
// ------------------------------------------------------------------------
|
|
||||||
/** Compute the value of the spherical harmonics basis functions (Yml)
|
|
||||||
* on each texel of a cubemap face
|
|
||||||
* \param face Face of the cubemap
|
|
||||||
* \param edge_size Size of the cubemap face
|
|
||||||
* \param[out] Yml The sphericals harmonics functions values on each texel of the cubemap
|
|
||||||
*/
|
|
||||||
void getYml(GLenum face, size_t edge_size,
|
|
||||||
float *Y00,
|
|
||||||
float *Y1minus1, float *Y10, float *Y11,
|
|
||||||
float *Y2minus2, float *Y2minus1, float *Y20, float *Y21, float *Y22)
|
|
||||||
{
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (int i = 0; i < int(edge_size); i++)
|
|
||||||
{
|
|
||||||
for (unsigned j = 0; j < edge_size; j++)
|
|
||||||
{
|
|
||||||
float x, y, z;
|
|
||||||
float fi = float(i), fj = float(j);
|
|
||||||
fi /= edge_size, fj /= edge_size;
|
|
||||||
fi = 2 * fi - 1, fj = 2 * fj - 1;
|
|
||||||
getXYZ(face, fi, fj, x, y, z);
|
|
||||||
|
|
||||||
// constant part of Ylm
|
|
||||||
float c00 = 0.282095f;
|
|
||||||
float c1minus1 = 0.488603f;
|
|
||||||
float c10 = 0.488603f;
|
|
||||||
float c11 = 0.488603f;
|
|
||||||
float c2minus2 = 1.092548f;
|
|
||||||
float c2minus1 = 1.092548f;
|
|
||||||
float c21 = 1.092548f;
|
|
||||||
float c20 = 0.315392f;
|
|
||||||
float c22 = 0.546274f;
|
|
||||||
|
|
||||||
size_t idx = i * edge_size + j;
|
|
||||||
|
|
||||||
Y00[idx] = c00;
|
|
||||||
Y1minus1[idx] = c1minus1 * y;
|
|
||||||
Y10[idx] = c10 * z;
|
|
||||||
Y11[idx] = c11 * x;
|
|
||||||
Y2minus2[idx] = c2minus2 * x * y;
|
|
||||||
Y2minus1[idx] = c2minus1 * y * z;
|
|
||||||
Y21[idx] = c21 * x * z;
|
|
||||||
Y20[idx] = c20 * (3 * z * z - 1);
|
|
||||||
Y22[idx] = c22 * (x * x - y * y);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} //namespace
|
} //namespace
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
/** Compute m_SH_coeff->red_SH_coeff, m_SH_coeff->green_SH_coeff
|
/** Compute m_SH_coeff->red_SH_coeff, m_SH_coeff->green_SH_coeff
|
||||||
* and m_SH_coeff->blue_SH_coeff from Yml values
|
* and m_SH_coeff->blue_SH_coeff from Yml values
|
||||||
* \param cubemap_face The 6 cubemap faces (float textures)
|
* \param sh_rgba The 6 cubemap faces (sRGB byte textures)
|
||||||
* \param edge_size Size of the cubemap face
|
* \param edge_size Size of the cubemap face
|
||||||
* \param Yml The sphericals harmonics functions values on each texel of the cubemap
|
|
||||||
*/
|
*/
|
||||||
void SphericalHarmonics::projectSH(Color *cubemap_face[6], size_t edge_size,
|
void SphericalHarmonics::generateSphericalHarmonics(unsigned char *sh_rgba[6], size_t edge_size)
|
||||||
float *Y00[],
|
|
||||||
float *Y1minus1[], float *Y10[], float *Y11[],
|
|
||||||
float *Y2minus2[], float *Y2minus1[], float * Y20[],
|
|
||||||
float *Y21[], float *Y22[])
|
|
||||||
{
|
{
|
||||||
for (unsigned i = 0; i < 9; i++)
|
|
||||||
|
#if SIMD_SSE2_SUPPORT
|
||||||
|
|
||||||
|
float wh = float(edge_size * edge_size);
|
||||||
|
float edge_size_inv;
|
||||||
|
float fi, fj, fi2p1;
|
||||||
|
unsigned char *shface;
|
||||||
|
__m128 sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh8;
|
||||||
|
|
||||||
|
// constant part of Ylm
|
||||||
|
const float c00 = 0.282095f;
|
||||||
|
const float c1minus1 = 0.488603f;
|
||||||
|
const float c10 = 0.488603f;
|
||||||
|
const float c11 = 0.488603f;
|
||||||
|
const float c2minus2 = 1.092548f;
|
||||||
|
const float c2minus1 = 1.092548f;
|
||||||
|
const float c21 = 1.092548f;
|
||||||
|
const float c20 = 0.315392f;
|
||||||
|
const float c22 = 0.546274f;
|
||||||
|
|
||||||
|
sh0 = _mm_setzero_ps();
|
||||||
|
sh1 = _mm_setzero_ps();
|
||||||
|
sh2 = _mm_setzero_ps();
|
||||||
|
sh3 = _mm_setzero_ps();
|
||||||
|
sh4 = _mm_setzero_ps();
|
||||||
|
sh5 = _mm_setzero_ps();
|
||||||
|
sh6 = _mm_setzero_ps();
|
||||||
|
sh7 = _mm_setzero_ps();
|
||||||
|
sh8 = _mm_setzero_ps();
|
||||||
|
|
||||||
|
edge_size_inv = 2.0f / edge_size;
|
||||||
|
for (unsigned face = 0; face < 6; face++)
|
||||||
{
|
{
|
||||||
m_SH_coeff->blue_SH_coeff[i] = 0;
|
shface = sh_rgba[face];
|
||||||
m_SH_coeff->green_SH_coeff[i] = 0;
|
for (int i = 0; i < int(edge_size); i++)
|
||||||
m_SH_coeff->red_SH_coeff[i] = 0;
|
{
|
||||||
|
int shidx = ( i * edge_size ) * 4;
|
||||||
|
fi = (float(i) * edge_size_inv) - 1.0f;
|
||||||
|
fi2p1 = (fi * fi) + 1.0f;
|
||||||
|
for (unsigned j = 0; j < edge_size; j++, shidx += 4)
|
||||||
|
{
|
||||||
|
float d, solidangle, dinv;
|
||||||
|
float shx, shy, shz;
|
||||||
|
__m128 vrgb;
|
||||||
|
|
||||||
|
fj = (float(j) * edge_size_inv) - 1.0f;
|
||||||
|
d = sqrtf(fi2p1 + (fj * fj));
|
||||||
|
dinv = 1.0f / d;
|
||||||
|
// Constant obtained by projecting unprojected ref values
|
||||||
|
solidangle = 2.75f / (wh * sqrtf(d*d*d));
|
||||||
|
#if SIMD_SSE4_1_SUPPORT
|
||||||
|
vrgb = _mm_cvtepi32_ps( _mm_cvtepu8_epi32( _mm_castps_si128( _mm_load_ss( (float *)&shface[shidx+0] ) ) ) );
|
||||||
|
#else
|
||||||
|
vrgb = _mm_cvtepi32_ps( _mm_unpacklo_epi16( _mm_unpacklo_epi8( _mm_castps_si128( _mm_load_ss( (float *)&shface[shidx+0] ) ), _mm_setzero_si128() ), _mm_setzero_si128() ) );
|
||||||
|
#endif
|
||||||
|
vrgb = _mm_mul_ps( srgb2linear4( vrgb ), _mm_set1_ps( solidangle ) );
|
||||||
|
|
||||||
|
// TODO: This is very suboptimal, and a pity to break such a streamlined loop
|
||||||
|
// I think people will disapprove if I unroll the 'face' loop 6 times...
|
||||||
|
// What about some branchless vblendps masking?
|
||||||
|
switch (face)
|
||||||
|
{
|
||||||
|
case 0: // PosX
|
||||||
|
shx = 1.0f;
|
||||||
|
shy = -fi;
|
||||||
|
shz = -fj;
|
||||||
|
break;
|
||||||
|
case 1: // NegX
|
||||||
|
shx = -1.0f;
|
||||||
|
shy = -fi;
|
||||||
|
shz = fj;
|
||||||
|
break;
|
||||||
|
case 2: // PosY
|
||||||
|
shx = fj;
|
||||||
|
shy = 1.0f;
|
||||||
|
shz = fi;
|
||||||
|
break;
|
||||||
|
case 3: // NegY
|
||||||
|
shx = fj;
|
||||||
|
shy = -1.0f;
|
||||||
|
shz = -fi;
|
||||||
|
break;
|
||||||
|
case 4: // PosZ
|
||||||
|
shx = fj;
|
||||||
|
shy = -fi;
|
||||||
|
shz = 1.0f;
|
||||||
|
break;
|
||||||
|
case 5: // NegZ
|
||||||
|
shx = -fj;
|
||||||
|
shy = -fi;
|
||||||
|
shz = -1.0f;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
shx *= dinv;
|
||||||
|
shy *= dinv;
|
||||||
|
shz *= dinv;
|
||||||
|
|
||||||
|
sh0 = _mm_add_ps( sh0, vrgb );
|
||||||
|
sh1 = _mm_add_ps( sh1, _mm_mul_ps( vrgb, _mm_set1_ps( shy ) ) );
|
||||||
|
sh2 = _mm_add_ps( sh2, _mm_mul_ps( vrgb, _mm_set1_ps( shz ) ) );
|
||||||
|
sh3 = _mm_add_ps( sh3, _mm_mul_ps( vrgb, _mm_set1_ps( shx ) ) );
|
||||||
|
sh4 = _mm_add_ps( sh4, _mm_mul_ps( vrgb, _mm_set1_ps( shx * shy ) ) );
|
||||||
|
sh5 = _mm_add_ps( sh5, _mm_mul_ps( vrgb, _mm_set1_ps( shy * shz ) ) );
|
||||||
|
sh6 = _mm_add_ps( sh6, _mm_mul_ps( vrgb, _mm_set1_ps( shx * shz ) ) );
|
||||||
|
sh7 = _mm_add_ps( sh7, _mm_mul_ps( vrgb, _mm_set1_ps( ((3.0f * shz * shz) - 1.0f) ) ) );
|
||||||
|
sh8 = _mm_add_ps( sh8, _mm_mul_ps( vrgb, _mm_set1_ps( ((shx * shx) - (shy * shy)) ) ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sh0 = _mm_mul_ps( sh0, _mm_set1_ps( c00 ) );
|
||||||
|
sh1 = _mm_mul_ps( sh1, _mm_set1_ps( c1minus1 ) );
|
||||||
|
sh2 = _mm_mul_ps( sh2, _mm_set1_ps( c10 ) );
|
||||||
|
sh3 = _mm_mul_ps( sh3, _mm_set1_ps( c11 ) );
|
||||||
|
sh4 = _mm_mul_ps( sh4, _mm_set1_ps( c2minus2 ) );
|
||||||
|
sh5 = _mm_mul_ps( sh5, _mm_set1_ps( c2minus1 ) );
|
||||||
|
sh6 = _mm_mul_ps( sh6, _mm_set1_ps( c21 ) );
|
||||||
|
sh7 = _mm_mul_ps( sh7, _mm_set1_ps( c20 ) );
|
||||||
|
sh8 = _mm_mul_ps( sh8, _mm_set1_ps( c22 ) );
|
||||||
|
|
||||||
|
m_SH_coeff->blue_SH_coeff[0] = _mm_cvtss_f32( sh0 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[1] = _mm_cvtss_f32( sh1 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[2] = _mm_cvtss_f32( sh2 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[3] = _mm_cvtss_f32( sh3 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[4] = _mm_cvtss_f32( sh4 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[5] = _mm_cvtss_f32( sh5 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[6] = _mm_cvtss_f32( sh6 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[7] = _mm_cvtss_f32( sh7 );
|
||||||
|
m_SH_coeff->blue_SH_coeff[8] = _mm_cvtss_f32( sh8 );
|
||||||
|
|
||||||
|
m_SH_coeff->green_SH_coeff[0] = _mm_cvtss_f32( _mm_shuffle_ps( sh0, sh0, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[1] = _mm_cvtss_f32( _mm_shuffle_ps( sh1, sh1, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[2] = _mm_cvtss_f32( _mm_shuffle_ps( sh2, sh2, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[3] = _mm_cvtss_f32( _mm_shuffle_ps( sh3, sh3, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[4] = _mm_cvtss_f32( _mm_shuffle_ps( sh4, sh4, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[5] = _mm_cvtss_f32( _mm_shuffle_ps( sh5, sh5, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[6] = _mm_cvtss_f32( _mm_shuffle_ps( sh6, sh6, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[7] = _mm_cvtss_f32( _mm_shuffle_ps( sh7, sh7, 0x55 ) );
|
||||||
|
m_SH_coeff->green_SH_coeff[8] = _mm_cvtss_f32( _mm_shuffle_ps( sh8, sh8, 0x55 ) );
|
||||||
|
|
||||||
|
m_SH_coeff->red_SH_coeff[0] = _mm_cvtss_f32( _mm_movehl_ps( sh0, sh0 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[1] = _mm_cvtss_f32( _mm_movehl_ps( sh1, sh1 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[2] = _mm_cvtss_f32( _mm_movehl_ps( sh2, sh2 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[3] = _mm_cvtss_f32( _mm_movehl_ps( sh3, sh3 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[4] = _mm_cvtss_f32( _mm_movehl_ps( sh4, sh4 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[5] = _mm_cvtss_f32( _mm_movehl_ps( sh5, sh5 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[6] = _mm_cvtss_f32( _mm_movehl_ps( sh6, sh6 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[7] = _mm_cvtss_f32( _mm_movehl_ps( sh7, sh7 ) );
|
||||||
|
m_SH_coeff->red_SH_coeff[8] = _mm_cvtss_f32( _mm_movehl_ps( sh8, sh8 ) );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
float wh = float(edge_size * edge_size);
|
float wh = float(edge_size * edge_size);
|
||||||
float b0 = 0., b1 = 0., b2 = 0., b3 = 0., b4 = 0., b5 = 0., b6 = 0., b7 = 0., b8 = 0.;
|
float b0 = 0., b1 = 0., b2 = 0., b3 = 0., b4 = 0., b5 = 0., b6 = 0., b7 = 0., b8 = 0.;
|
||||||
float r0 = 0., r1 = 0., r2 = 0., r3 = 0., r4 = 0., r5 = 0., r6 = 0., r7 = 0., r8 = 0.;
|
|
||||||
float g0 = 0., g1 = 0., g2 = 0., g3 = 0., g4 = 0., g5 = 0., g6 = 0., g7 = 0., g8 = 0.;
|
float g0 = 0., g1 = 0., g2 = 0., g3 = 0., g4 = 0., g5 = 0., g6 = 0., g7 = 0., g8 = 0.;
|
||||||
|
float r0 = 0., r1 = 0., r2 = 0., r3 = 0., r4 = 0., r5 = 0., r6 = 0., r7 = 0., r8 = 0.;
|
||||||
|
float edge_size_inv;
|
||||||
|
float fi, fj, fi2p1;
|
||||||
|
unsigned char *shface;
|
||||||
|
|
||||||
|
// constant part of Ylm
|
||||||
|
const float c00 = 0.282095f;
|
||||||
|
const float c1minus1 = 0.488603f;
|
||||||
|
const float c10 = 0.488603f;
|
||||||
|
const float c11 = 0.488603f;
|
||||||
|
const float c2minus2 = 1.092548f;
|
||||||
|
const float c2minus1 = 1.092548f;
|
||||||
|
const float c21 = 1.092548f;
|
||||||
|
const float c20 = 0.315392f;
|
||||||
|
const float c22 = 0.546274f;
|
||||||
|
|
||||||
|
edge_size_inv = 2.0f / edge_size;
|
||||||
for (unsigned face = 0; face < 6; face++)
|
for (unsigned face = 0; face < 6; face++)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for reduction(+ : b0, b1, b2, b3, b4, b5, b6, b7, b8, \
|
shface = sh_rgba[face];
|
||||||
r0, r1, r2, r3, r4, r5, r6, r7, r8, \
|
|
||||||
g0, g1, g2, g3, g4, g5, g6, g7, g8)
|
|
||||||
for (int i = 0; i < int(edge_size); i++)
|
for (int i = 0; i < int(edge_size); i++)
|
||||||
{
|
{
|
||||||
for (unsigned j = 0; j < edge_size; j++)
|
int shidx = ( i * edge_size ) * 4;
|
||||||
|
fi = (float(i) * edge_size_inv) - 1.0f;
|
||||||
|
fi2p1 = (fi * fi) + 1.0f;
|
||||||
|
for (unsigned j = 0; j < edge_size; j++, shidx += 4)
|
||||||
{
|
{
|
||||||
int idx = i * edge_size + j;
|
fj = (float(j) * edge_size_inv) - 1.0f;
|
||||||
float fi = float(i), fj = float(j);
|
|
||||||
fi /= edge_size, fj /= edge_size;
|
|
||||||
fi = 2 * fi - 1, fj = 2 * fj - 1;
|
|
||||||
|
|
||||||
|
|
||||||
float d = sqrt(fi * fi + fj * fj + 1);
|
|
||||||
|
|
||||||
|
float d = sqrtf(fi2p1 + (fj * fj));
|
||||||
// Constant obtained by projecting unprojected ref values
|
// Constant obtained by projecting unprojected ref values
|
||||||
float solidangle = 2.75f / (wh * pow(d, 1.5f));
|
float solidangle = 2.75f / (wh * sqrtf(d*d*d));
|
||||||
// pow(., 2.2) to convert from srgb
|
float r, g, b;
|
||||||
float b = cubemap_face[face][edge_size * i + j].Blue;
|
float y00, y1m1, y10, y11, y2m2, y2m1, y21, y20, y22;
|
||||||
float g = cubemap_face[face][edge_size * i + j].Green;
|
float shx, shy, shz;
|
||||||
float r = cubemap_face[face][edge_size * i + j].Red;
|
|
||||||
|
|
||||||
b0 += b * Y00[face][idx] * solidangle;
|
b = srgb2linear( float(shface[shidx+0]) ) * solidangle;
|
||||||
b1 += b * Y1minus1[face][idx] * solidangle;
|
g = srgb2linear( float(shface[shidx+1]) ) * solidangle;
|
||||||
b2 += b * Y10[face][idx] * solidangle;
|
r = srgb2linear( float(shface[shidx+2]) ) * solidangle;
|
||||||
b3 += b * Y11[face][idx] * solidangle;
|
|
||||||
b4 += b * Y2minus2[face][idx] * solidangle;
|
|
||||||
b5 += b * Y2minus1[face][idx] * solidangle;
|
|
||||||
b6 += b * Y20[face][idx] * solidangle;
|
|
||||||
b7 += b * Y21[face][idx] * solidangle;
|
|
||||||
b8 += b * Y22[face][idx] * solidangle;
|
|
||||||
|
|
||||||
g0 += g * Y00[face][idx] * solidangle;
|
getXYZ(face, fi, fj, shx, shy, shz);
|
||||||
g1 += g * Y1minus1[face][idx] * solidangle;
|
y00 = c00;
|
||||||
g2 += g * Y10[face][idx] * solidangle;
|
y1m1 = c1minus1 * shy;
|
||||||
g3 += g * Y11[face][idx] * solidangle;
|
y10 = c10 * shz;
|
||||||
g4 += g * Y2minus2[face][idx] * solidangle;
|
y11 = c11 * shx;
|
||||||
g5 += g * Y2minus1[face][idx] * solidangle;
|
y2m2 = c2minus2 * shx * shy;
|
||||||
g6 += g * Y20[face][idx] * solidangle;
|
y2m1 = c2minus1 * shy * shz;
|
||||||
g7 += g * Y21[face][idx] * solidangle;
|
y21 = c21 * shx * shz;
|
||||||
g8 += g * Y22[face][idx] * solidangle;
|
y20 = c20 * ((3.0f * shz * shz) - 1.0f);
|
||||||
|
y22 = c22 * ((shx * shx) - (shy * shy));
|
||||||
|
|
||||||
|
b0 += b * y00;
|
||||||
|
b1 += b * y1m1;
|
||||||
|
b2 += b * y10;
|
||||||
|
b3 += b * y11;
|
||||||
|
b4 += b * y2m2;
|
||||||
|
b5 += b * y2m1;
|
||||||
|
b6 += b * y20;
|
||||||
|
b7 += b * y21;
|
||||||
|
b8 += b * y22;
|
||||||
|
|
||||||
r0 += r * Y00[face][idx] * solidangle;
|
g0 += g * y00;
|
||||||
r1 += r * Y1minus1[face][idx] * solidangle;
|
g1 += g * y1m1;
|
||||||
r2 += r * Y10[face][idx] * solidangle;
|
g2 += g * y10;
|
||||||
r3 += r * Y11[face][idx] * solidangle;
|
g3 += g * y11;
|
||||||
r4 += r * Y2minus2[face][idx] * solidangle;
|
g4 += g * y2m2;
|
||||||
r5 += r * Y2minus1[face][idx] * solidangle;
|
g5 += g * y2m1;
|
||||||
r6 += r * Y20[face][idx] * solidangle;
|
g6 += g * y20;
|
||||||
r7 += r * Y21[face][idx] * solidangle;
|
g7 += g * y21;
|
||||||
r8 += r * Y22[face][idx] * solidangle;
|
g8 += g * y22;
|
||||||
|
|
||||||
|
r0 += r * y00;
|
||||||
|
r1 += r * y1m1;
|
||||||
|
r2 += r * y10;
|
||||||
|
r3 += r * y11;
|
||||||
|
r4 += r * y2m2;
|
||||||
|
r5 += r * y2m1;
|
||||||
|
r6 += r * y20;
|
||||||
|
r7 += r * y21;
|
||||||
|
r8 += r * y22;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -307,60 +503,15 @@ void SphericalHarmonics::projectSH(Color *cubemap_face[6], size_t edge_size,
|
|||||||
m_SH_coeff->green_SH_coeff[6] = g6;
|
m_SH_coeff->green_SH_coeff[6] = g6;
|
||||||
m_SH_coeff->green_SH_coeff[7] = g7;
|
m_SH_coeff->green_SH_coeff[7] = g7;
|
||||||
m_SH_coeff->green_SH_coeff[8] = g8;
|
m_SH_coeff->green_SH_coeff[8] = g8;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
printf( "#### SH ; Coeffs R ; %f %f %f %f %f %f %f %f\n", m_SH_coeff->red_SH_coeff[0], m_SH_coeff->red_SH_coeff[1], m_SH_coeff->red_SH_coeff[2], m_SH_coeff->red_SH_coeff[3], m_SH_coeff->red_SH_coeff[4], m_SH_coeff->red_SH_coeff[5], m_SH_coeff->red_SH_coeff[6], m_SH_coeff->red_SH_coeff[7] );
|
||||||
|
printf( "#### SH ; Coeffs G ; %f %f %f %f %f %f %f %f\n", m_SH_coeff->green_SH_coeff[0], m_SH_coeff->green_SH_coeff[1], m_SH_coeff->green_SH_coeff[2], m_SH_coeff->green_SH_coeff[3], m_SH_coeff->green_SH_coeff[4], m_SH_coeff->green_SH_coeff[5], m_SH_coeff->green_SH_coeff[6], m_SH_coeff->green_SH_coeff[7] );
|
||||||
|
printf( "#### SH ; Coeffs B ; %f %f %f %f %f %f %f %f\n", m_SH_coeff->blue_SH_coeff[0], m_SH_coeff->blue_SH_coeff[1], m_SH_coeff->blue_SH_coeff[2], m_SH_coeff->blue_SH_coeff[3], m_SH_coeff->blue_SH_coeff[4], m_SH_coeff->blue_SH_coeff[5], m_SH_coeff->blue_SH_coeff[6], m_SH_coeff->blue_SH_coeff[7] );
|
||||||
|
*/
|
||||||
} // projectSH
|
} // projectSH
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
|
||||||
/** Generate the 9 first SH coefficients for each color channel
|
|
||||||
* using the cubemap provided by CubemapFace.
|
|
||||||
* \param cubemap_face The 6 cubemap faces (float textures)
|
|
||||||
* \param edge_size Size of the cubemap face
|
|
||||||
*/
|
|
||||||
void SphericalHarmonics::generateSphericalHarmonics(Color *cubemap_face[6], size_t edge_size)
|
|
||||||
{
|
|
||||||
float *Y00[6];
|
|
||||||
float *Y1minus1[6];
|
|
||||||
float *Y10[6];
|
|
||||||
float *Y11[6];
|
|
||||||
float *Y2minus2[6];
|
|
||||||
float *Y2minus1[6];
|
|
||||||
float *Y20[6];
|
|
||||||
float *Y21[6];
|
|
||||||
float *Y22[6];
|
|
||||||
|
|
||||||
for (unsigned face = 0; face < 6; face++)
|
|
||||||
{
|
|
||||||
Y00[face] = new float[edge_size * edge_size];
|
|
||||||
Y1minus1[face] = new float[edge_size * edge_size];
|
|
||||||
Y10[face] = new float[edge_size * edge_size];
|
|
||||||
Y11[face] = new float[edge_size * edge_size];
|
|
||||||
Y2minus2[face] = new float[edge_size * edge_size];
|
|
||||||
Y2minus1[face] = new float[edge_size * edge_size];
|
|
||||||
Y20[face] = new float[edge_size * edge_size];
|
|
||||||
Y21[face] = new float[edge_size * edge_size];
|
|
||||||
Y22[face] = new float[edge_size * edge_size];
|
|
||||||
|
|
||||||
getYml(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face, edge_size, Y00[face],
|
|
||||||
Y1minus1[face], Y10[face], Y11[face], Y2minus2[face],
|
|
||||||
Y2minus1[face], Y20[face], Y21[face], Y22[face]);
|
|
||||||
}
|
|
||||||
|
|
||||||
projectSH(cubemap_face, edge_size, Y00, Y1minus1, Y10, Y11, Y2minus2,
|
|
||||||
Y2minus1, Y20, Y21, Y22);
|
|
||||||
|
|
||||||
for (unsigned face = 0; face < 6; face++)
|
|
||||||
{
|
|
||||||
delete[] Y00[face];
|
|
||||||
delete[] Y1minus1[face];
|
|
||||||
delete[] Y10[face];
|
|
||||||
delete[] Y11[face];
|
|
||||||
delete[] Y2minus2[face];
|
|
||||||
delete[] Y2minus1[face];
|
|
||||||
delete[] Y20[face];
|
|
||||||
delete[] Y21[face];
|
|
||||||
delete[] Y22[face];
|
|
||||||
}
|
|
||||||
} // generateSphericalHarmonics
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
SphericalHarmonics::SphericalHarmonics(const std::vector<video::ITexture *> &spherical_harmonics_textures)
|
SphericalHarmonics::SphericalHarmonics(const std::vector<video::ITexture *> &spherical_harmonics_textures)
|
||||||
{
|
{
|
||||||
@ -427,15 +578,10 @@ void SphericalHarmonics::setTextures(const std::vector<video::ITexture *> &spher
|
|||||||
#endif
|
#endif
|
||||||
} //for (unsigned i = 0; i < 6; i++)
|
} //for (unsigned i = 0; i < 6; i++)
|
||||||
|
|
||||||
Color *float_tex_cube[6];
|
generateSphericalHarmonics(sh_rgba, sh_w);
|
||||||
convertToFloatTexture(sh_rgba, sh_w, sh_h, float_tex_cube);
|
|
||||||
generateSphericalHarmonics(float_tex_cube, sh_w);
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < 6; i++)
|
for (unsigned i = 0; i < 6; i++)
|
||||||
{
|
|
||||||
delete[] sh_rgba[i];
|
delete[] sh_rgba[i];
|
||||||
delete[] float_tex_cube[i];
|
|
||||||
}
|
|
||||||
} //setSphericalHarmonicsTextures
|
} //setSphericalHarmonicsTextures
|
||||||
|
|
||||||
/** Compute spherical harmonics coefficients from ambient light */
|
/** Compute spherical harmonics coefficients from ambient light */
|
||||||
@ -451,29 +597,28 @@ void SphericalHarmonics::setAmbientLight(const video::SColor &ambient)
|
|||||||
unsigned char *sh_rgba[6];
|
unsigned char *sh_rgba[6];
|
||||||
unsigned sh_w = 16;
|
unsigned sh_w = 16;
|
||||||
unsigned sh_h = 16;
|
unsigned sh_h = 16;
|
||||||
|
unsigned ambr, ambg, ambb;
|
||||||
|
|
||||||
|
ambr = ambient.getBlue();
|
||||||
|
ambg = ambient.getGreen();
|
||||||
|
ambb = ambient.getBlue();
|
||||||
|
|
||||||
for (unsigned i = 0; i < 6; i++)
|
for (unsigned i = 0; i < 6; i++)
|
||||||
{
|
{
|
||||||
sh_rgba[i] = new unsigned char[sh_w * sh_h * 4];
|
sh_rgba[i] = new unsigned char[sh_w * sh_h * 4];
|
||||||
|
|
||||||
for (unsigned j = 0; j < sh_w * sh_h * 4; j += 4)
|
for (unsigned j = 0; j < sh_w * sh_h * 4; j += 4)
|
||||||
{
|
{
|
||||||
sh_rgba[i][j] = ambient.getBlue();
|
sh_rgba[i][j] = ambb;
|
||||||
sh_rgba[i][j + 1] = ambient.getGreen();
|
sh_rgba[i][j + 1] = ambg;
|
||||||
sh_rgba[i][j + 2] = ambient.getRed();
|
sh_rgba[i][j + 2] = ambr;
|
||||||
sh_rgba[i][j + 3] = 255;
|
sh_rgba[i][j + 3] = 255;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Color *float_tex_cube[6];
|
generateSphericalHarmonics(sh_rgba, sh_w);
|
||||||
convertToFloatTexture(sh_rgba, sh_w, sh_h, float_tex_cube);
|
|
||||||
generateSphericalHarmonics(float_tex_cube, sh_w);
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < 6; i++)
|
for (unsigned i = 0; i < 6; i++)
|
||||||
{
|
|
||||||
delete[] sh_rgba[i];
|
delete[] sh_rgba[i];
|
||||||
delete[] float_tex_cube[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Diffuse env map is x 0.25, compensate
|
// Diffuse env map is x 0.25, compensate
|
||||||
for (unsigned i = 0; i < 9; i++)
|
for (unsigned i = 0; i < 9; i++)
|
||||||
|
@ -49,12 +49,7 @@ private:
|
|||||||
/** The spherical harmonics coefficients */
|
/** The spherical harmonics coefficients */
|
||||||
SHCoefficients *m_SH_coeff;
|
SHCoefficients *m_SH_coeff;
|
||||||
|
|
||||||
void projectSH(Color *cubemap_face[6], size_t edge_size, float *Y00[],
|
void generateSphericalHarmonics(unsigned char *sh_rgba[6], size_t edge_size);
|
||||||
float *Y1minus1[], float *Y10[], float *Y11[],
|
|
||||||
float *Y2minus2[], float *Y2minus1[], float * Y20[],
|
|
||||||
float *Y21[], float *Y22[]);
|
|
||||||
|
|
||||||
void generateSphericalHarmonics(Color *cubemap_face[6], size_t edge_size);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SphericalHarmonics(const std::vector<irr::video::ITexture *> &spherical_harmonics_textures);
|
SphericalHarmonics(const std::vector<irr::video::ITexture *> &spherical_harmonics_textures);
|
||||||
|
Loading…
Reference in New Issue
Block a user