/* ----------------------------------------------------------------------------- * * Copyright (c) 2008-2016 Alexis Naveros. * * * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps() * * * Some functions are Copyright (C) 2008 José Fonseca * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps() * * * Portions developed under contract to the SURVICE Engineering Company. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. * * ----------------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include "cpusimd.h" //// #if CPU_SSE_SUPPORT const uint32_t CPU_ALIGN16 simd4fSignMask[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; const uint32_t CPU_ALIGN16 simd4fSignMaskInv[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; const float CPU_ALIGN16 simd4fHalf[4] = { 0.5, 0.5, 0.5, 0.5 }; const float CPU_ALIGN16 simd4fOne[4] = { 1.0, 1.0, 1.0, 1.0 }; const float CPU_ALIGN16 simd4fTwo[4] = { 2.0, 2.0, 2.0, 2.0 }; const float CPU_ALIGN16 simd4fThree[4] = { 3.0, 3.0, 3.0, 3.0 }; const uint32_t CPU_ALIGN16 simd4uOne[4] = { 1, 1, 1, 1 }; const uint32_t CPU_ALIGN16 simd4uOneInv[4] = { ~1, ~1, ~1, ~1 }; const uint32_t CPU_ALIGN16 simd4uTwo[4] = { 2, 2, 2, 2 }; const uint32_t CPU_ALIGN16 simd4uFour[4] = { 4, 4, 4, 4 }; const float CPU_ALIGN16 simd4fQuarter[4] = { 0.25, 0.25, 0.25, 0.25 }; // Avoid double to float conversion warnings when using M_PI --> use our own PI #define PI 3.14159265358979323846f const float CPU_ALIGN16 simd4fPi[4] = { PI, PI, PI, PI }; const float CPU_ALIGN16 simd4fZeroOneTwoThree[4] = { 0.0, 1.0, 2.0, 3.0 }; const uint32_t CPU_ALIGN16 simd4fAlphaMask[4] = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }; const float CPU_ALIGN16 simd4f255[4] = { 255.0f, 255.0f, 255.0f, 255.0f }; const float CPU_ALIGN16 simd4f255Inv[4] = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f }; #endif //// #if CPU_SSE2_SUPPORT /* Copyright (C) 2007 Julien Pommier This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. (this is the zlib license) */ static const float CPU_ALIGN16 simd4f_cephes_FOPI[4] = { 1.27323954473516f, 1.27323954473516f, 1.27323954473516f, 1.27323954473516f }; static const float CPU_ALIGN16 simd4f_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; static const float CPU_ALIGN16 simd4f_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float CPU_ALIGN16 simd4f_minus_cephes_DP3[4] = { -3.77489497744594108e-8f, -3.77489497744594108e-8f, -3.77489497744594108e-8f, -3.77489497744594108e-8f }; static const float CPU_ALIGN16 simd4f_sincof_p0[4] = { -1.9515295891E-4f, -1.9515295891E-4f, -1.9515295891E-4f, -1.9515295891E-4f }; static const float CPU_ALIGN16 simd4f_sincof_p1[4] = { 8.3321608736E-3f, 8.3321608736E-3f, 8.3321608736E-3f, 8.3321608736E-3f }; static const float CPU_ALIGN16 simd4f_sincof_p2[4] = { -1.6666654611E-1f, -1.6666654611E-1f, -1.6666654611E-1f, -1.6666654611E-1f }; static const float CPU_ALIGN16 simd4f_coscof_p0[4] = { 2.443315711809948E-005f, 2.443315711809948E-005f, 2.443315711809948E-005f, 2.443315711809948E-005f }; static const float CPU_ALIGN16 simd4f_coscof_p1[4] = { -1.388731625493765E-003f, -1.388731625493765E-003f, -1.388731625493765E-003f, -1.388731625493765E-003f }; static const float CPU_ALIGN16 simd4f_coscof_p2[4] = { 4.166664568298827E-002f, 4.166664568298827E-002f, 4.166664568298827E-002f, 4.166664568298827E-002f }; __m128 simd4f_sin_ps( __m128 x ) { __m128 xmm1, xmm2, xmm3, sign_bit, y; __m128i emm0, emm2; xmm2 = _mm_setzero_ps(); sign_bit = x; /* take the absolute value */ x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv ); /* extract the sign bit (upper one) */ sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI); /* store the integer part of y in mm0 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne); emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv); y = _mm_cvtepi32_ps(emm2); /* get the swap sign flag */ emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour); emm0 = _mm_slli_epi32(emm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4