diff --git a/lib/irrlicht/include/matrix4.h b/lib/irrlicht/include/matrix4.h index 45fe98374..1d812b308 100644 --- a/lib/irrlicht/include/matrix4.h +++ b/lib/irrlicht/include/matrix4.h @@ -12,6 +12,9 @@ #include "aabbox3d.h" #include "rect.h" #include "irrString.h" +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) +#include <xmmintrin.h> +#endif // enable this to keep track of changes to the matrix // and make simpler identity check for seldomly changing matrices @@ -1307,20 +1310,110 @@ namespace core template <class T> - inline bool CMatrix4<T>::getInverse(CMatrix4<T>& out) const - { - /// Calculates the inverse of this Matrix - /// The inverse is calculated using Cramers rule. - /// If no inverse exists then 'false' is returned. + inline bool CMatrix4<T>::getInverse(CMatrix4<T>& out) const + { + /// Calculates the inverse of this Matrix + /// The inverse is calculated using Cramers rule. + /// If no inverse exists then 'false' is returned. #if defined ( USE_MATRIX_TEST ) - if ( this->isIdentity() ) - { - out=*this; - return true; - } + if ( this->isIdentity() ) + { + out=*this; + return true; + } #endif - const CMatrix4<T> &m = *this; + const CMatrix4<T> &m = *this; +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + float *src = (float*)m.pointer(); + float *dst = (float*)out.pointer(); + // from http://www.intel.com/design/pentiumiii/sml/245043.htm + { + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src + 4)); + row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src + 8)), (__m64*)(src + 12)); + row0 = _mm_shuffle_ps(tmp1, row1, 0x88); + row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); + tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src + 2)), (__m64*)(src + 6)); + row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src + 10)), (__m64*)(src + 14)); + row2 = _mm_shuffle_ps(tmp1, row3, 0x88); + row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); + // 
----------------------------------------------- + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = 
_mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + // ----------------------------------------------- + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + tmp1 = _mm_rcp_ss(det); + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); + minor0 = _mm_mul_ps(det, minor0); + _mm_storel_pi((__m64*)(dst), minor0); + _mm_storeh_pi((__m64*)(dst + 2), minor0); + minor1 = _mm_mul_ps(det, minor1); + _mm_storel_pi((__m64*)(dst + 4), minor1); + _mm_storeh_pi((__m64*)(dst + 6), minor1); + minor2 = _mm_mul_ps(det, minor2); + _mm_storel_pi((__m64*)(dst + 8), minor2); + _mm_storeh_pi((__m64*)(dst + 10), minor2); + minor3 = _mm_mul_ps(det, minor3); + _mm_storel_pi((__m64*)(dst + 12), minor3); + _mm_storeh_pi((__m64*)(dst + 14), minor3); + } + return true; +#else f32 d = (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)) * (m(2, 2) * m(3, 3) - m(2, 3) * m(3, 2)) - (m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0)) * (m(2, 1) * m(3, 3) - m(2, 3) * m(3, 1)) + @@ -1387,6 +1480,7 @@ namespace core out.definitelyIdentityMatrix = definitelyIdentityMatrix; #endif return true; +#endif }