/* ----------------------------------------------------------------------------- * * Copyright (c) 2008-2016 Alexis Naveros. * * * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps() * * * Some functions are Copyright (C) 2008 José Fonseca * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps() * * * Portions developed under contract to the SURVICE Engineering Company. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. * * ----------------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include "cpusimd.h" #ifndef M_PI #define M_PI 3.14159265358979323846 #endif //// #if CPU_SSE_SUPPORT const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 }; const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 }; const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 }; const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 }; const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 }; const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1, ~1, ~1, ~1 }; const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 }; const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 }; const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 }; const float simd4fPi[4] CPU_ALIGN16 = { M_PI, M_PI, M_PI, M_PI }; const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 }; const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }; const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f }; const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f }; #endif //// #if CPU_SSE2_SUPPORT /* Copyright (C) 2007 Julien Pommier This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. (this is the zlib license) */ static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; static const float simd4f_sincof_p0[4] CPU_ALIGN16 = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; static const float simd4f_sincof_p1[4] CPU_ALIGN16 = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float simd4f_sincof_p2[4] CPU_ALIGN16 = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; static const float simd4f_coscof_p0[4] CPU_ALIGN16 = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float simd4f_coscof_p1[4] CPU_ALIGN16 = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; static const float simd4f_coscof_p2[4] CPU_ALIGN16 = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; __m128 simd4f_sin_ps( __m128 x ) { __m128 xmm1, xmm2, xmm3, sign_bit, y; __m128i emm0, emm2; xmm2 = _mm_setzero_ps(); sign_bit = x; /* take the absolute value */ x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv ); /* extract the sign bit (upper one) */ sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI); /* store the integer part of y in mm0 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne); emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv); y = _mm_cvtepi32_ps(emm2); /* get the swap sign flag */ emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour); emm0 = _mm_slli_epi32(emm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4