Try HQMipmapGenerator
commit 76aa38e5b4
parent 498ce3ebc9
@@ -116,6 +116,10 @@ if((WIN32 AND NOT MINGW) OR APPLE)
    set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/")
    set(JPEG_LIBRARY jpeglib)
endif()

add_subdirectory("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
include_directories("${PROJECT_SOURCE_DIR}/lib/graphics_utils")

# Build the irrlicht library
add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht")
include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include")
@@ -370,6 +374,7 @@ target_link_libraries(supertuxkart
    bulletmath
    enet
    stkirrlicht
    graphics_utils
    ${Angelscript_LIBRARIES}
    ${CURL_LIBRARIES}
    ${OGGVORBIS_LIBRARIES}

lib/graphics_utils/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 2.6)
if (UNIX OR MINGW)
    add_definitions(-O3 -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -ffast-math)
endif()
add_library(graphics_utils STATIC
    mipmap/cpusimd.c
    mipmap/img.c
    mipmap/imgresize.c
)

lib/graphics_utils/mipmap/cpusimd.c (new file, 568 lines)
@@ -0,0 +1,568 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2008-2016 Alexis Naveros.
 *
 *
 * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
 * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
 *
 *
 * Some functions are Copyright (C) 2008 José Fonseca
 * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps()
 *
 *
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>

#include <sys/time.h>


#include "cpusimd.h"


////


#if CPU_SSE_SUPPORT

const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 };
const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 };
const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 };
const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 };
const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 };
const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1, ~1, ~1, ~1 };
const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 };
const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 };
const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 };
const float simd4fPi[4] CPU_ALIGN16 = { M_PI, M_PI, M_PI, M_PI };
const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 };
const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff };
const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f };
const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f };

#endif


////


#if CPU_SSE2_SUPPORT


/* Copyright (C) 2007 Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
static const float simd4f_sincof_p0[4] CPU_ALIGN16 = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
static const float simd4f_sincof_p1[4] CPU_ALIGN16 = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
static const float simd4f_sincof_p2[4] CPU_ALIGN16 = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
static const float simd4f_coscof_p0[4] CPU_ALIGN16 = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
static const float simd4f_coscof_p1[4] CPU_ALIGN16 = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
static const float simd4f_coscof_p2[4] CPU_ALIGN16 = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };

__m128 simd4f_sin_ps( __m128 x )
{
  __m128 xmm1, xmm2, xmm3, sign_bit, y;
  __m128i emm0, emm2;

  xmm2 = _mm_setzero_ps();

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv );
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2
     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128 *)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128 *)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128 *)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m128 *)simd4f_coscof_p0;
  __m128 z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128 *)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128 *)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128 *)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
__m128 simd4f_cos_ps( __m128 x )
{
  __m128 xmm1, xmm2, xmm3, y;
  __m128i emm0, emm2;

  xmm2 = _mm_setzero_ps();

  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(__m128i*)simd4uTwo);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  __m128 sign_bit = _mm_castsi128_ps(emm0);
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m128*)simd4f_coscof_p0;
  __m128 z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128*)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c )
{
  __m128 xmm1, xmm2, xmm3, sign_bit_sin, y;
  __m128i emm0, emm2, emm4;

  xmm3 = _mm_setzero_ps();

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)simd4fSignMask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  emm4 = _mm_sub_epi32(emm4, *(__m128i*)simd4uTwo);
  emm4 = _mm_andnot_si128(emm4, *(__m128i*)simd4uFour);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  __m128 z = _mm_mul_ps(x,x);
  y = *(__m128*)simd4f_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)simd4fOne);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  __m128 y2 = *(__m128*)simd4f_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif


////


#if CPU_SSE2_SUPPORT


/* Copyright (C) 2008 José Fonseca
   http://jrfonseca.blogspot.ca/2008/09/fast-sse2-pow-tables-or-polynomials.html
   MIT license

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in all
   copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#define POLY0(x,c0) _mm_set1_ps(c0)
#define POLY1(x,c0,c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x,c0,c1,c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x,c0,c1,c2,c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x,c0,c1,c2,c3,c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x,c0,c1,c2,c3,c4,c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

__m128 simd4f_exp2_ps( __m128 x )
{
  __m128i ipart;
  __m128 fpart, expipart, expfpart;

  x = _mm_min_ps( x, _mm_set1_ps( 129.00000f ) );
  x = _mm_max_ps( x, _mm_set1_ps( -126.99999f ) );
  /* ipart = int(x - 0.5) */
  ipart = _mm_cvtps_epi32( _mm_sub_ps( x, _mm_set1_ps( 0.5f ) ) );
  /* fpart = x - ipart */
  fpart = _mm_sub_ps( x, _mm_cvtepi32_ps( ipart ) );
  /* expipart = (float) (1 << ipart) */
  expipart = _mm_castsi128_ps( _mm_slli_epi32( _mm_add_epi32( ipart, _mm_set1_epi32( 127 ) ), 23 ) );
  /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
  expfpart = POLY5( fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f );
#elif EXP_POLY_DEGREE == 4
  expfpart = POLY4( fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f );
#elif EXP_POLY_DEGREE == 3
  expfpart = POLY3( fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f );
#elif EXP_POLY_DEGREE == 2
  expfpart = POLY2( fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f );
#else
#error
#endif
  return _mm_mul_ps(expipart, expfpart);
}

__m128 simd4f_log2_ps( __m128 x )
{
  __m128i expmask, mantmask, i;
  __m128 one, vexp, mant, logmant;

  expmask = _mm_set1_epi32( 0x7f800000 );
  mantmask = _mm_set1_epi32( 0x007fffff );
  one = _mm_set1_ps( 1.0f );
  i = _mm_castps_si128( x );
  /* exp = (float) exponent(x) */
  vexp = _mm_cvtepi32_ps( _mm_sub_epi32( _mm_srli_epi32( _mm_and_si128( i, expmask ), 23 ), _mm_set1_epi32( 127 ) ) );
  /* mant = (float) mantissa(x) */
  mant = _mm_or_ps( _mm_castsi128_ps( _mm_and_si128( i, mantmask ) ), one );
  /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
   * These coefficients can be generate with
   * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   */
#if LOG_POLY_DEGREE == 6
  logmant = POLY5( mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f );
#elif LOG_POLY_DEGREE == 5
  logmant = POLY4( mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f );
#elif LOG_POLY_DEGREE == 4
  logmant = POLY3( mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f );
#elif LOG_POLY_DEGREE == 3
  logmant = POLY2( mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f );
#else
#error
#endif
  /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
  logmant = _mm_mul_ps( logmant, _mm_sub_ps(mant, one ) );
  return _mm_add_ps( logmant, vexp );
}


__m128 simd4f_pow_ps( __m128 x, __m128 y )
{
  return simd4f_exp2_ps( _mm_mul_ps( simd4f_log2_ps( x ), y ) );
}


#endif

////


#if CPU_SSE2_SUPPORT


/*
  By Potatoswatter
  http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent
*/

#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif

static inline CC_ALWAYSINLINE __m128 simd4f_fastpow_ps( __m128 arg, uint32_t expnum, uint32_t expden, uint32_t coeffnum, uint32_t coeffden )
{
  __m128 ret = arg;
  float corrfactor, powfactor;
  /* Apply a constant pre-correction factor. */
  corrfactor = exp2( 127.0 * expden / expnum - 127.0 ) * pow( 1.0 * coeffnum / coeffden, 1.0 * expden / expnum );
  powfactor = 1.0 * expnum / expden;
  ret = _mm_mul_ps( ret, _mm_set1_ps( corrfactor ) );
  /* Reinterpret arg as integer to obtain logarithm. */
  ret = _mm_cvtepi32_ps( _mm_castps_si128( ret ) );
  /* Multiply logarithm by power. */
  ret = _mm_mul_ps( ret, _mm_set1_ps( powfactor ) );
  /* Convert back to "integer" to exponentiate. */
  ret = _mm_castsi128_ps( _mm_cvtps_epi32( ret ) );
  return ret;
}

__m128 simd4f_pow12d5_ps( __m128 arg )
{
  /* Lower exponents provide lower initial error, but too low causes overflow. */
  __m128 xf = simd4f_fastpow_ps( arg, 4, 5, (int)( 1.38316186f * 1e9 ), (int)1e9 );
  /* Imprecise 4-cycle sqrt is still far better than fastpow, good enough. */
  __m128 xfm4 = _mm_rsqrt_ps( xf );
  __m128 xf4 = _mm_mul_ps( xf, xfm4 );
  /* Precisely calculate x^2 and x^3 */
  __m128 x2 = _mm_mul_ps( arg, arg );
  __m128 x3 = _mm_mul_ps( x2, arg );
  /* Overestimate of x^2 * x^0.4 */
  x2 = _mm_mul_ps( x2, xf4 );
  /* Get x^-0.2 from x^0.4, and square it for x^-0.4. Combine into x^-0.6. */
  __m128 xfm2 = _mm_rsqrt_ps( xf4 );
  x3 = _mm_mul_ps( x3, xfm4 );
  x3 = _mm_mul_ps( x3, xfm2 );
  return _mm_mul_ps( _mm_add_ps( x2, x3 ), _mm_set1_ps( 1.0f/1.960131704207789f * 0.9999f ) );
}

__m128 simd4f_pow5d12_ps( __m128 arg )
{
  /* 5/12 is too small, so compute the 4th root of 20/12 instead. */
  /* 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. */
  /* weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 */
  __m128 xf = simd4f_fastpow_ps( arg, 2, 3, (int)( 0.629960524947437f * 1e9 ), (int)1e9 );
  __m128 xover = _mm_mul_ps( arg, xf );
  __m128 xfm1 = _mm_rsqrt_ps( xf );
  __m128 x2 = _mm_mul_ps( arg, arg );
  __m128 xunder = _mm_mul_ps( x2, xfm1 );
  /* sqrt2 * over + 2 * sqrt2 * under */
  __m128 xavg = _mm_mul_ps( _mm_set1_ps( 1.0f/( 3.0f * 0.629960524947437f ) * 0.999852f ), _mm_add_ps( xover, xunder ) );
  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
  return xavg;
}

#endif


////

lib/graphics_utils/mipmap/cpusimd.h (new file, 410 lines)
@@ -0,0 +1,410 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2008-2016 Alexis Naveros.
 *
 * The SIMD trigonometry functions are Copyright (C) 2007 Julien Pommier
 * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
 *
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */


#ifndef CPUSIMD_H
#define CPUSIMD_H


////


#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
#if __SSE__ || _M_X64 || _M_IX86_FP >= 1 || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2 || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#if __SSE4A__ || CPU_ENABLE_SSE4A
#include <ammintrin.h>
#define CPU_SSE4A_SUPPORT (1)
#endif
#if __AVX__ || CPU_ENABLE_AVX
#include <immintrin.h>
#define CPU_AVX_SUPPORT (1)
#endif
#if __AVX2__ || CPU_ENABLE_AVX2
#include <immintrin.h>
#define CPU_AVX2_SUPPORT (1)
#endif
#if __XOP__ || CPU_ENABLE_XOP
#include <immintrin.h>
#define CPU_XOP_SUPPORT (1)
#endif
#if __FMA3__ || CPU_ENABLE_FMA3
#include <immintrin.h>
#define CPU_FMA3_SUPPORT (1)
#endif
#if __FMA4__ || CPU_ENABLE_FMA4
#include <immintrin.h>
#define CPU_FMA4_SUPPORT (1)
#endif
#if __RDRND__ || CPU_ENABLE_RDRND
#include <immintrin.h>
#define CPU_RDRND_SUPPORT (1)
#endif
#if __POPCNT__ || CPU_ENABLE_POPCNT
#include <popcntintrin.h>
#define CPU_POPCNT_SUPPORT (1)
#endif
#if __LZCNT__ || CPU_ENABLE_LZCNT
#include <lzcntintrin.h>
#define CPU_LZCNT_SUPPORT (1)
#endif
#if __F16C__ || CPU_ENABLE_F16C
#include <f16cintrin.h>
#define CPU_F16C_SUPPORT (1)
#endif
#if __BMI__ || CPU_ENABLE_BMI
#include <bmiintrin.h>
#define CPU_BMI_SUPPORT (1)
#endif
#if __BMI2__ || CPU_ENABLE_BMI2
#include <bmi2intrin.h>
#define CPU_BMI2_SUPPORT (1)
#endif
#if __TBM__ || CPU_ENABLE_TBM
#include <tbmintrin.h>
#define CPU_TBM_SUPPORT (1)
#endif


#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CPU_ALIGN16 __attribute__((aligned(16)))
#define CPU_ALIGN32 __attribute__((aligned(32)))
#define CPU_ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
#define CPU_ALIGN16 __declspec(align(16))
#define CPU_ALIGN64 __declspec(align(64))
#else
#define CPU_ALIGN16
#define CPU_ALIGN32
#define CPU_ALIGN64
#warning "SSE/AVX Disabled: Unsupported Compiler."
#undef CPU_SSE_SUPPORT
#undef CPU_SSE2_SUPPORT
#undef CPU_SSE3_SUPPORT
#undef CPU_SSSE3_SUPPORT
#undef CPU_SSE4_1_SUPPORT
#undef CPU_SSE4_2_SUPPORT
#undef CPU_AVX_SUPPORT
#undef CPU_AVX2_SUPPORT
#undef CPU_XOP_SUPPORT
#undef CPU_FMA3_SUPPORT
#undef CPU_FMA4_SUPPORT
#endif


////


#if CPU_SSE_SUPPORT
#define CPU_APPROX_DIV_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rcp_ss(_mm_set_ss(w))))
#define CPU_APPROX_SQRT_FLOAT(z) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(z))))
#define CPU_APPROX_RSQRT_FLOAT(z) _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(z)))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(w))))
#else
#define CPU_APPROX_DIV_FLOAT(z,w) ((z)/(w))
#define CPU_APPROX_SQRT_FLOAT(z) (sqrtf(z))
#define CPU_APPROX_RSQRT_FLOAT(z) (1.0/sqrtf(z))
#define CPU_APPROX_DIVSQRT_FLOAT(z,w) ((z)/sqrtf(w))
#endif


#if CPU_SSE3_SUPPORT
#define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy)
#define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy)
#elif CPU_SSE_SUPPORT
static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy )
{
  __m128 vh, vl;
  vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) );
  vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) );
  return _mm_add_ps( vh, vl );
}
#define CPU_HADD_PD(vx,vy) _mm_add_sd(vx,_mm_unpackhi_pd(vy,vy))
#endif


#if CPU_SSE4_1_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_cvtepu8_epi32(x)
#define CPU_CVT_S8_TO_I32(x,vzero) _mm_cvtepi8_epi32(x)
#elif CPU_SSE2_SUPPORT
#define CPU_CVT_U8_TO_I32(x,vzero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((x),(vzero)),(vzero))
static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero )
{
  __m128i vsign;
  vsign = _mm_cmpgt_epi8( vzero, vx );
  return _mm_unpacklo_epi16( _mm_unpacklo_epi8( vx, vsign ), _mm_unpacklo_epi8( vsign, vsign ) );
}
#endif


#if CPU_SSE4_1_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask)
#define CPU_BLENDV_PD(x,y,mask) _mm_blendv_pd(x,y,mask)
#elif CPU_SSE2_SUPPORT
#define CPU_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask))
#define CPU_BLENDV_PD(x,y,mask) _mm_or_pd(_mm_andnot_pd(mask,x),_mm_and_pd(y,mask))
#endif


/*
  CPU_FMADD = ((f0*f1)+t0)
  CPU_FMSUB = ((f0*f1)-t0)
*/
#if CPU_FMA3_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0)
#elif CPU_FMA4_SUPPORT
#define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0)
#else
#define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0)
#define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0)
#define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0)
#define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0)
#define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0)
#define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0)
#define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0)
#define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0)
#define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0)
#endif

////


#if CPU_SSE_SUPPORT

extern const uint32_t simd4fSignMask[4];
extern const uint32_t simd4fSignMaskInv[4];
extern const float simd4fHalf[4];
extern const float simd4fOne[4];
extern const float simd4fTwo[4];
extern const float simd4fThree[4];
extern const uint32_t simd4uOne[4];
extern const uint32_t simd4uOneInv[4];
extern const uint32_t simd4uTwo[4];
extern const uint32_t simd4uFour[4];
extern const float simd4fQuarter[4];
extern const float simd4fPi[4];
extern const float simd4fZeroOneTwoThree[4];
extern const uint32_t simd4fAlphaMask[4];
extern const float simd4f255[4];
extern const float simd4f255Inv[4];

#endif


#if CPU_SSE2_SUPPORT

/* Input range between -8192 and 8192 */
__m128 simd4f_sin_ps( __m128 x );
__m128 simd4f_cos_ps( __m128 x );
void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c );

#endif

#if CPU_SSE2_SUPPORT

__m128 simd4f_exp2_ps( __m128 x );
__m128 simd4f_log2_ps( __m128 x );
__m128 simd4f_pow_ps( __m128 x, __m128 y );

#endif

#if CPU_SSE2_SUPPORT

__m128 simd4f_pow12d5_ps( __m128 arg );
__m128 simd4f_pow5d12_ps( __m128 arg );

#endif


////


#if CPU_SSE2_SUPPORT

#ifndef CC_ALWAYSINLINE
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define CC_ALWAYSINLINE __attribute__((always_inline))
#else
#define CC_ALWAYSINLINE
#endif
#endif

static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx )
{
  __m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
  vx2 = _mm_mul_ps( vx, vx );
  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) );
  vpwsqrtinv = _mm_rsqrt_ps( vpow );
  vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
  return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) );
}

static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
{
  __m128 vpow;
  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) );
  vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) );
#if 0
  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
#else
  vx = _mm_sqrt_ps( vx );
  vx = _mm_sqrt_ps( vx );
#endif
  return vx;
}

#endif


////


#if CPU_SSE_SUPPORT

static inline void simdPrintDebugSSE4f( char *str, __m128 v )
{
  float CPU_ALIGN16 store[4];
  _mm_store_ps( (void *)store, v );
  printf( "%s %f %f %f %f\n", str, store[0], store[1], store[2], store[3] );
  return;
}

static inline void simdPrintDebugSSE2d( char *str, __m128d v )
{
  double CPU_ALIGN16 store[2];
  _mm_store_pd( (void *)store, v );
  printf( "%s %f %f\n", str, store[0], store[1] );
  return;
}

static inline void simdPrintDebugSSE16u8( char *str, __m128i v )
{
  uint8_t CPU_ALIGN16 store[16];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] );
  return;
}

static inline void simdPrintDebugSSE8u16( char *str, __m128i v )
{
  uint16_t CPU_ALIGN16 store[8];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] );
  return;
}

static inline void simdPrintDebugSSE4u32( char *str, __m128i v )
{
  uint32_t CPU_ALIGN16 store[4];
  _mm_store_si128( (void *)store, v );
  printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] );
  return;
}

static inline void simdPrintDebugSSE2u64( char *str, __m128i v )
{
  uint64_t CPU_ALIGN16 store[2];
  _mm_store_si128( (void *)store, v );
  printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] );
  return;
}

#endif


////


#endif

lib/graphics_utils/mipmap/img.c (new file, 628 lines; listing truncated below)
@@ -0,0 +1,628 @@
/* *****************************************************************************
 *
 * Copyright (c) 2007-2016 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this file; see the file named COPYING for more
 * information.
 *
 * *****************************************************************************
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>


#include "cpusimd.h"

#include "img.h"


#ifndef ADDRESS
#define ADDRESS(p,o) ((void *)(((char *)p)+(o)))
#endif


////


void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey )
{
  int y;
  void *dst, *src;
  src = ADDRESS( image->data, ( srcx * image->format.bytesperpixel ) + ( srcy * image->format.bytesperline ) );
  dst = ADDRESS( image->data, ( dstx * image->format.bytesperpixel ) + ( dsty * image->format.bytesperline ) );
  for( y = 0 ; y < sizey ; y++ )
  {
    memcpy( dst, src, sizex * image->format.bytesperpixel );
    src = ADDRESS( src, image->format.bytesperline );
    dst = ADDRESS( dst, image->format.bytesperline );
  }
  return;
}


#if CPU_SSE2_SUPPORT
static const uint16_t CPU_ALIGN16 imgBlendRgbMask[8] = { 0xffff, 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000 };
static const uint8_t CPU_ALIGN16 imgBlendAlphaTestMask[16] = { 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff };
static const uint16_t CPU_ALIGN16 imgBlendRoundBias[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };
#if CPU_SSSE3_SUPPORT
static const uint8_t CPU_ALIGN16 imgBlendShufMask[16] = { 6,7,6,7,6,7,6,7, 14,15,14,15,14,15,14,15 };
#endif
#endif

static void imgBlendImageRgba2Rgba( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
#if CPU_SSE2_SUPPORT
  int row4size;
  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
  __m128i vzero, v255, vrgbmask, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
  __m128i vshufmask;
#endif
#else
  int32_t dstr, dstg, dstb, dsta;
  int32_t srcr, srcg, srcb, srca;
#endif
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

#if CPU_SSE2_SUPPORT
  row4size = srcimage->format.width & ~3;
  vzero = _mm_setzero_si128();
  v255 = _mm_set1_epi16( 255 );
  vrgbmask = _mm_load_si128( (void *)imgBlendRgbMask );
  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;

#if CPU_SSE2_SUPPORT
    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
    {
      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
      vsrc23 = _mm_loadu_si128( (void *)srcrow );
      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
        continue;
      vdst23 = _mm_loadu_si128( (void *)dstrow );
      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, _mm_and_si128( vblend23, vrgbmask ) ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
      /* Combine interleaved and store */
      _mm_storeu_si128( (void *)dstrow, _mm_packus_epi16( vdst01, vdst23 ) );
    }
    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_packus_epi16( vdst01, vdst01 ) ) );
    }
#else
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dsta = (int32_t)dstrow[3];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dsta = ( ( dsta << 8 ) - dsta + ( srca * srca ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dsta = ( dsta + ( dsta >> 8 ) ) >> 8;
      if( dsta > 255 )
        dsta = 255;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
      dstrow[3] = (unsigned char)dsta;
    }
#endif
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

static void imgBlendImageRgba2Rgbx( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
#if CPU_SSE2_SUPPORT
  int row4size;
  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
  __m128i vzero, v255, valphatest, vroundbias;
#if CPU_SSSE3_SUPPORT
  __m128i vshufmask;
#endif
#else
  int32_t dstr, dstg, dstb;
  int32_t srcr, srcg, srcb, srca;
#endif
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

#if CPU_SSE2_SUPPORT
  row4size = srcimage->format.width & ~3;
  vzero = _mm_setzero_si128();
  v255 = _mm_set1_epi16( 255 );
  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
#if CPU_SSSE3_SUPPORT
  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
#endif
#endif

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;

#if CPU_SSE2_SUPPORT
    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
    {
      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
      vsrc23 = _mm_loadu_si128( (void *)srcrow );
      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
        continue;
      vdst23 = _mm_loadu_si128( (void *)dstrow );
      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
#if CPU_SSSE3_SUPPORT
      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, vblend23 ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
      /* Combine interleaved and store */
      _mm_storeu_si128( (void *)dstrow, _mm_or_si128( _mm_packus_epi16( vdst01, vdst23 ), valphatest ) );
    }
    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
#if CPU_SSSE3_SUPPORT
      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
#else
      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
#endif
      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
      /* Correction to divide by 255 instead of 256 */
      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_or_si128( _mm_packus_epi16( vdst01, vdst01 ), valphatest ) ) );
    }
#else
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
      dstrow[3] = (unsigned char)255;
    }
#endif
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

static void imgBlendImageRgba2Rgb( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
{
  int x, y;
  int32_t dstr, dstg, dstb;
  int32_t srcr, srcg, srcb, srca;
  unsigned char *src, *srcrow, *dstrow;
  uint32_t *dst;

  /* TODO: Other function to clamp copy area? */

  src = srcimage->data;
  dst = ADDRESS( dstimage->data, ( dstx * 3 ) + ( dsty * dstimage->format.bytesperline ) );
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = (unsigned char *)dst;
    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 3 )
    {
      if( !( srcrow[3] ) )
        continue;
      srcr = (int32_t)srcrow[0];
      srcg = (int32_t)srcrow[1];
      srcb = (int32_t)srcrow[2];
      srca = (int32_t)srcrow[3];
      dstr = (int32_t)dstrow[0];
      dstg = (int32_t)dstrow[1];
      dstb = (int32_t)dstrow[2];
      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
      dstrow[0] = (unsigned char)dstr;
      dstrow[1] = (unsigned char)dstg;
      dstrow[2] = (unsigned char)dstb;
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
|
||||
{
|
||||
void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
|
||||
blendfunc = 0;
|
||||
if( srcimage->format.bytesperpixel == 4 )
|
||||
{
|
||||
if( dstimage->format.bytesperpixel == 4 )
|
||||
{
|
||||
if( ( dstimage->format.type == IMG_FORMAT_TYPE_RGBA32 ) || ( dstimage->format.type == IMG_FORMAT_TYPE_BGRA32 ) )
|
||||
blendfunc = imgBlendImageRgba2Rgba;
|
||||
else
|
||||
blendfunc = imgBlendImageRgba2Rgbx;
|
||||
}
|
||||
else if( dstimage->format.bytesperpixel == 3 )
|
||||
blendfunc = imgBlendImageRgba2Rgb;
|
||||
}
|
||||
return blendfunc;
|
||||
}
|
||||
|
||||
|
||||
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
|
||||
{
|
||||
void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
|
||||
blendfunc = imgBlendGetFunction( dstimage, srcimage );
|
||||
if( blendfunc )
|
||||
{
|
||||
blendfunc( dstimage, dstx, dsty, srcimage );
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
////
|
||||
|
||||
|
||||
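For reference, a hypothetical caller of the dispatcher above (not part of the patch): it blends a 4-bytes-per-pixel RGBA sprite onto a 3-bytes-per-pixel RGB canvas, which resolves to imgBlendImageRgba2Rgb().

/* Usage sketch, assuming both imgImage structures are already allocated and
 * filled in; imgBlendImage() returns 0 when no blend path exists for the
 * source/destination format pair. */
#include <stdio.h>
#include "img.h"

static void blendSpriteExample( imgImage *canvasrgb24, imgImage *spritergba32 )
{
  if( !( imgBlendImage( canvasrgb24, 16, 16, spritergba32 ) ) )
    printf( "imgBlendImage: unsupported format pair\n" );
  return;
}
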
void imgAllocCopy( imgImage *dstimage, imgImage *srcimage )
{
  dstimage->format = srcimage->format;
  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );
  memcpy( dstimage->data, srcimage->data, srcimage->format.height * srcimage->format.bytesperline );
  return;
}


void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize )
{
  int y;
  void *dst, *src, *dstrow;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = srcimage->format.type;
  dstimage->format.bytesperpixel = srcimage->format.bytesperpixel;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = srcimage->data;
  dst = dstimage->data;
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    dstrow = dst;
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
    memcpy( dstrow, src, srcimage->format.width * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocExtractChannel( imgImage *dstimage, imgImage *srcimage, int channelindex )
{
  int x, y;
  unsigned char *dst, *src, *srcrow;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcrow = src;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dst[x] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize )
{
  int x, y;
  unsigned char *src, *dst, *srcrow, *dstrow;

  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
  dstimage->format.bytesperpixel = 1;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < srcimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = dst;
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
    for( x = 0 ; x < srcimage->format.width ; x++ )
    {
      dstrow[x] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
    }
    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }
  for( y = 0 ; y < extendsize ; y++ )
  {
    memset( dst, 0, dstimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}


void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b )
{
  int x, y;
  unsigned char *dst, *src, *dstrow, *srcrow;

  dstimage->format.width = srcimage->format.width;
  dstimage->format.height = srcimage->format.height;
  dstimage->format.type = IMG_FORMAT_TYPE_RGBA32;
  dstimage->format.bytesperpixel = 4;
  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );

  src = ADDRESS( srcimage->data, channelindex );
  dst = dstimage->data;
  for( y = 0 ; y < dstimage->format.height ; y++ )
  {
    srcrow = src;
    dstrow = dst;
    for( x = 0 ; x < dstimage->format.width ; x++ )
    {
      dstrow[0] = r;
      dstrow[1] = g;
      dstrow[2] = b;
      dstrow[3] = *srcrow;
      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
      dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
    }
    src = ADDRESS( src, srcimage->format.bytesperline );
    dst = ADDRESS( dst, dstimage->format.bytesperline );
  }

  return;
}

void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast )
{
  int x, y;
  float r, g, b;
  unsigned char *dst, *src, *dstrow, *srcrow;

  dstimage->format = srcimage->format;
  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );

  brightness += 0.5f;

  if( dstimage->format.bytesperpixel >= 3 )
  {
    src = srcimage->data;
    dst = dstimage->data;
    for( y = 0 ; y < dstimage->format.height ; y++ )
    {
      srcrow = src;
      dstrow = dst;
      for( x = 0 ; x < dstimage->format.width ; x++ )
      {
        r = (1.0f/255.0f) * (float)srcrow[0];
        g = (1.0f/255.0f) * (float)srcrow[1];
        b = (1.0f/255.0f) * (float)srcrow[2];
        r = ( ( r - 0.5f ) * contrast ) + brightness;
        g = ( ( g - 0.5f ) * contrast ) + brightness;
        b = ( ( b - 0.5f ) * contrast ) + brightness;
        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
        dstrow[1] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( g * 255.0f ) ) );
        dstrow[2] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( b * 255.0f ) ) );
        if( dstimage->format.bytesperpixel >= 4 )
          dstrow[3] = srcrow[3];
        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
      }
      src = ADDRESS( src, srcimage->format.bytesperline );
      dst = ADDRESS( dst, dstimage->format.bytesperline );
    }
  }
  else if( dstimage->format.bytesperpixel == 1 )
  {
    src = srcimage->data;
    dst = dstimage->data;
    for( y = 0 ; y < dstimage->format.height ; y++ )
    {
      srcrow = src;
      dstrow = dst;
      for( x = 0 ; x < dstimage->format.width ; x++ )
      {
        r = (1.0f/255.0f) * (float)srcrow[0];
        r = ( ( r - 0.5f ) * contrast ) + brightness;
        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
      }
      src = ADDRESS( src, srcimage->format.bytesperline );
      dst = ADDRESS( dst, dstimage->format.bytesperline );
    }
  }

  return;
}

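Because the function adds 0.5f to brightness up front, the per-channel mapping in normalized [0,1] space is out = ((in - 0.5) * contrast) + 0.5 + brightness, followed by rescaling, rounding and clamping. A standalone spot check (not part of the patch): with brightness = 0 and contrast = 1 the mapping should reproduce every 8-bit input after rounding.

/* Spot check of the brightness/contrast mapping used above. */
#include <assert.h>
#include <math.h>

int main( void )
{
  int i, out;
  float v;
  for( i = 0 ; i <= 255 ; i++ )
  {
    v = (1.0f/255.0f) * (float)i;
    v = ( ( v - 0.5f ) * 1.0f ) + ( 0.5f + 0.0f );
    out = (int)fmaxf( 0.0f, fminf( 255.0f, roundf( v * 255.0f ) ) );
    assert( out == i );
  }
  return 0;
}
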
void imgFree( imgImage *image )
{
  free( image->data );
  image->data = 0;
  return;
}


////

74 lib/graphics_utils/mipmap/img.h Normal file
@ -0,0 +1,74 @@
/* *****************************************************************************
 *
 * Copyright (c) 2007-2016 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this file; see the file named COPYING for more
 * information.
 *
 * *****************************************************************************
 */

#ifndef IMG_H
#define IMG_H


typedef struct
{
  int width;
  int height;
  int type;
  int bytesperpixel;
  int bytesperline;
} imgFormat;

enum
{
  IMG_FORMAT_TYPE_ANY,
  IMG_FORMAT_TYPE_RGB24,
  IMG_FORMAT_TYPE_BGR24,
  IMG_FORMAT_TYPE_RGBX32,
  IMG_FORMAT_TYPE_BGRX32,
  IMG_FORMAT_TYPE_RGBA32,
  IMG_FORMAT_TYPE_BGRA32,
  IMG_FORMAT_TYPE_GRAYSCALE,
  IMG_FORMAT_TYPE_GRAYALPHA
};

typedef struct
{
  imgFormat format;
  void *data;
} imgImage;


////


void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey );

void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );

void imgAllocCopy( imgImage *dst, imgImage *src );
void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize );
void imgAllocExtractChannel( imgImage *dst, imgImage *src, int channelindex );
void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize );
void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b );
void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast );

void imgFree( imgImage *image );


#endif

4098 lib/graphics_utils/mipmap/imgresize.c Normal file
File diff suppressed because it is too large

150 lib/graphics_utils/mipmap/imgresize.h Normal file
@ -0,0 +1,150 @@
/* -----------------------------------------------------------------------------
 *
 * Copyright (c) 2014-2017 Alexis Naveros.
 * Portions developed under contract to the SURVICE Engineering Company.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * -----------------------------------------------------------------------------
 */


#ifndef IMGRESIZE_H
#define IMGRESIZE_H


typedef struct
{
  /* Specify filter type, from the IM_REDUCE_FILTER_* list */
  int filter;
  /* High quality, a little slow: hopcount=3; */
  /* Good quality, much faster: hopcount=2; */
  int hopcount;
  /* Strong preservation/amplification of details: alpha=2.0f; */
  /* Mild preservation/amplification of details: alpha=6.0f; */
  float alpha;
  /* NORMALMAP filters: factor to amplify normals on X and Y before normalization */
  float amplifynormal;
  /* NORMALMAP_SUSTAIN filters: Preserve a factor of deviation "energy" as calculated by sqrtf(x*x+y*y) */
  float normalsustainfactor;
} imReduceOptions;

static inline void imReduceSetOptions( imReduceOptions *options, int filter, int hopcount, float alpha, float amplifynormal, float normalsustainfactor )
{
  options->filter = filter;
  options->hopcount = hopcount;
  options->alpha = alpha;
  options->amplifynormal = amplifynormal;
  options->normalsustainfactor = normalsustainfactor;
  return;
}


/* Reduce the image's dimensions by an integer divisor ~ this is fairly fast */
int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options );
/* Same as imReduceImageKaiserDataDivisor(), but imgdst is allocated */
int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options );


/* Reduce the image's dimensions to match the newwidth and newheight ~ this is a little slower */
int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options );
/* Same as imReduceImageKaiserData(), but imgdst is allocated */
int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options );


/* Resize by half with a dumb box filter ~ don't use that except for the smallest mipmaps */
/* Filters with ALPHANORM and/or SUSTAIN keywords are processed as the regular base filter only */
int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options );
int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options );


/*
Keywords for image reduction filters

LINEAR: Data is linear, note that this is *not* the format of typical diffuse textures
SRGB: Color is in sRGB space, any alpha is presumed linear
NORMALMAP: RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear

ALPHANORM: Alpha normalization, the weight of pixels is proportional to their alpha values
  (do you have "black" fully transparent pixels? please use an ALPHANORM filter)
SUSTAIN: The "energy" of the normal map is sustained, amplified to preserve the level of details
  Note that this filter is rather slow (set options->normalsustainfactor to 0.75 or so)
*/

enum
{
  /* Linear space */
  IM_REDUCE_FILTER_LINEAR,
  IM_REDUCE_FILTER_LINEAR_ALPHANORM,

  /* sRGB space (probably what you want for diffuse textures) */
  IM_REDUCE_FILTER_SRGB,
  IM_REDUCE_FILTER_SRGB_ALPHANORM,

  /* RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear */
  IM_REDUCE_FILTER_NORMALMAP,
  IM_REDUCE_FILTER_NORMALMAP_ALPHANORM,
  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN,
  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM,

  /* Custom specialized filters */
  IM_REDUCE_FILTER_WATERMAP,
  IM_REDUCE_FILTER_PLANTMAP,
  IM_REDUCE_FILTER_FOLLIAGE,
  IM_REDUCE_FILTER_SKY,
  IM_REDUCE_FILTER_FOG
};

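The comments above already suggest concrete settings; a hedged usage sketch (not from the patch) tying them together: halve an sRGB RGBA texture with the Kaiser filter, using the "high quality" hopcount and alpha values quoted in the comments (the same values the new HQMipmapGenerator passes later in this commit). imgresize.h relies on imgImage, so img.h is included first.

/* Usage sketch: imReduceImageKaiserDivisor() allocates the destination image;
 * a zero return is treated as failure, matching the assert in
 * HQMipmapGenerator::threadedReload(). */
#include <stdio.h>
#include "img.h"
#include "imgresize.h"

static int halveTextureExample( imgImage *halfsize, imgImage *fullsize )
{
  imReduceOptions options;
  imReduceSetOptions( &options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f );
  if( !( imReduceImageKaiserDivisor( halfsize, fullsize, 2, &options ) ) )
  {
    printf( "imReduceImageKaiserDivisor() failed\n" );
    return 0;
  }
  return 1;
}
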
////


#define IM_MIPMAP_CASCADE_MAX (16)

typedef struct
{
  int width;
  int height;
  int layercount;
  int bytesperpixel;
  int bytesperline;
  imReduceOptions *options;
  void *mipmap[IM_MIPMAP_CASCADE_MAX];
} imMipmapCascade;


int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags );

void imFreeMipmapCascade( imMipmapCascade *cascade );

/* For base texture, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_BASE (0x1)
/* For generated mipmaps, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
#define IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS (0x2)

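A hedged sketch of the cascade API (not from the patch), mirroring the call sequence HQMipmapGenerator::threadedReload() uses later in this commit: build the chain from the level-0 pixels, read the reduced levels, then free. The assumption that mipmap[0] is the base level and mipmap[1] the first reduction is inferred from that code, which uploads mipmap[i + 1].

/* Assumptions: rgba points at width*height*4 bytes of sRGB pixels with no
 * row padding (bytesperline = width * 4), single layer. */
#include <stdio.h>
#include "img.h"
#include "imgresize.h"

static void buildCascadeExample( unsigned char *rgba, int width, int height )
{
  imReduceOptions options;
  imMipmapCascade cascade;
  imReduceSetOptions( &options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f );
  if( !( imBuildMipmapCascade( &cascade, rgba, width, height, 1, 4, width * 4, &options, 0 ) ) )
    return;
  /* cascade.mipmap[1] is the first reduced level (half width, half height) */
  printf( "first reduced level at %p\n", cascade.mipmap[1] );
  imFreeMipmapCascade( &cascade );
  return;
}
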
////


void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline );


////


#endif

@ -934,6 +934,11 @@ namespace UserConfigParams
    PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked",
                   "Enable all karts and tracks") );

    PARAM_PREFIX BoolUserConfigParam m_hq_mipmap
        PARAM_DEFAULT( BoolUserConfigParam(false, "hq_mipmap",
                       "Generate mipmap for textures using "
                       "high quality method with SSE") );

    // TODO? implement blacklist for new irrlicht device and GUI
    PARAM_PREFIX std::vector<std::string> m_blacklist_res;

@ -490,7 +490,7 @@ bool CentralVideoSettings::isARBPixelBufferObjectUsable() const

bool CentralVideoSettings::supportsThreadedTextureLoading() const
{
    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable();
    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable() && isARBTextureStorageUsable();
}

#endif   // !SERVER_ONLY

119 src/graphics/hq_mipmap_generator.cpp Normal file
@ -0,0 +1,119 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#include "graphics/hq_mipmap_generator.hpp"
#define DUMP_MIPMAP
#ifdef DUMP_MIPMAP
#include "graphics/irr_driver.hpp"
#include "utils/string_utils.hpp"
#endif
#include <cassert>

extern "C"
{
#include <mipmap/img.h>
#include <mipmap/imgresize.h>
}

// ----------------------------------------------------------------------------
HQMipmapGenerator::HQMipmapGenerator(const io::path& name, uint8_t* data,
                                     const core::dimension2d<u32>& size,
                                     GLuint texture_name, bool single_channel)
                 : video::ITexture(name), m_orig_data(data), m_size(size),
                   m_texture_name(texture_name), m_texture_size(0),
                   m_single_channel(single_channel), m_mipmap_data(NULL)
{
    unsigned width = m_size.Width;
    unsigned height = m_size.Height;
    while (true)
    {
        width = width < 2 ? 1 : width >> 1;
        height = height < 2 ? 1 : height >> 1;
        m_mipmap_sizes.emplace_back(core::dimension2du(width, height),
                                    m_texture_size);
        m_texture_size += width * height * (m_single_channel ? 1 : 4);
        if (width == 1 && height == 1)
            break;
    }
    m_texture_size = unsigned(m_mipmap_sizes.back().second) +
        (m_single_channel ? 1 : 4);
    m_mipmap_data = malloc(sizeof(imMipmapCascade));
}   // HQMipmapGenerator

// ----------------------------------------------------------------------------
HQMipmapGenerator::~HQMipmapGenerator()
{
    imFreeMipmapCascade((imMipmapCascade*)m_mipmap_data);
    free(m_mipmap_data);
}   // ~HQMipmapGenerator

// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedReload(void* ptr, void* param) const
{
    imReduceOptions options;
    imReduceSetOptions(&options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f);
    imMipmapCascade* mm_cascade = (imMipmapCascade*)m_mipmap_data;
#ifdef DEBUG
    int ret = imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
    assert(ret == 1);
#else
    imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
#endif
    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
    {
        memcpy((uint8_t*)ptr + m_mipmap_sizes[i].second,
            mm_cascade->mipmap[i + 1],
            m_mipmap_sizes[i].first.getArea() * (m_single_channel ? 1 : 4));
#ifdef DUMP_MIPMAP
        if (m_single_channel) continue;
        video::IImage* image = irr_driver->getVideoDriver()
            ->createImageFromData(video::ECF_A8R8G8B8, m_mipmap_sizes[i].first,
            mm_cascade->mipmap[i + 1], false/*ownForeignMemory*/);
        irr_driver->getVideoDriver()->writeImageToFile(image, std::string
            (StringUtils::toString(i) + "_" +
            StringUtils::getBasename(NamedPath.getPtr()) + ".png").c_str());
        image->drop();
#endif
    }
}   // threadedReload

// ----------------------------------------------------------------------------
void HQMipmapGenerator::threadedSubImage(void* ptr) const
{
#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
    glBindTexture(GL_TEXTURE_2D, m_texture_name);
    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
    {
        glTexSubImage2D(GL_TEXTURE_2D, i + 1, 0, 0,
            m_mipmap_sizes[i].first.Width, m_mipmap_sizes[i].first.Height,
            m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE,
            (uint8_t*)ptr + m_mipmap_sizes[i].second);
    }
    delete this;
#endif
}   // threadedSubImage

// ----------------------------------------------------------------------------
void HQMipmapGenerator::cleanThreadedLoader()
{
    delete[] m_orig_data;
}   // cleanThreadedLoader
98 src/graphics/hq_mipmap_generator.hpp Normal file
@ -0,0 +1,98 @@
// SuperTuxKart - a fun racing game with go-kart
// Copyright (C) 2017 SuperTuxKart-Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#ifndef HEADER_HQ_MIPMAP_GENERATOR_HPP
#define HEADER_HQ_MIPMAP_GENERATOR_HPP

#include "graphics/gl_headers.hpp"
#include "utils/no_copy.hpp"
#include "utils/types.hpp"

#include <vector>
#include <ITexture.h>

using namespace irr;

class HQMipmapGenerator : public video::ITexture, NoCopy
{
private:
    uint8_t* m_orig_data;

    core::dimension2d<u32> m_size;

    GLuint m_texture_name;

    unsigned int m_texture_size;

    bool m_single_channel;

    void* m_mipmap_data;

    std::vector<std::pair<core::dimension2d<u32>, size_t> > m_mipmap_sizes;

public:
    // ------------------------------------------------------------------------
    HQMipmapGenerator(const io::path& name, uint8_t* data,
                      const core::dimension2d<u32>& size, GLuint texture_name,
                      bool single_channel);
    // ------------------------------------------------------------------------
    virtual ~HQMipmapGenerator();
    // ------------------------------------------------------------------------
    virtual void* lock(video::E_TEXTURE_LOCK_MODE mode =
                       video::ETLM_READ_WRITE, u32 mipmap_level = 0)
                                                            { return NULL; }
    // ------------------------------------------------------------------------
    virtual void unlock() {}
    // ------------------------------------------------------------------------
    virtual const core::dimension2d<u32>& getOriginalSize() const
                                                            { return m_size; }
    // ------------------------------------------------------------------------
    virtual const core::dimension2d<u32>& getSize() const { return m_size; }
    // ------------------------------------------------------------------------
    virtual video::E_DRIVER_TYPE getDriverType() const
    {
#if defined(USE_GLES2)
        return video::EDT_OGLES2;
#else
        return video::EDT_OPENGL;
#endif
    }
    // ------------------------------------------------------------------------
    virtual video::ECOLOR_FORMAT getColorFormat() const
                                                { return video::ECF_A8R8G8B8; }
    // ------------------------------------------------------------------------
    virtual u32 getPitch() const { return 0; }
    // ------------------------------------------------------------------------
    virtual bool hasMipMaps() const { return false; }
    // ------------------------------------------------------------------------
    virtual void regenerateMipMapLevels(void* mipmap_data = NULL) {}
    // ------------------------------------------------------------------------
    virtual u32 getOpenGLTextureName() const { return m_texture_name; }
    // ------------------------------------------------------------------------
    virtual u64 getHandle() { return 0; }
    // ------------------------------------------------------------------------
    virtual unsigned int getTextureSize() const { return m_texture_size; }
    // ------------------------------------------------------------------------
    virtual void threadedReload(void* ptr, void* param) const;
    // ------------------------------------------------------------------------
    virtual void threadedSubImage(void* ptr) const;
    // ------------------------------------------------------------------------
    virtual void cleanThreadedLoader();

};   // HQMipmapGenerator

#endif
@ -17,6 +17,7 @@

#include "graphics/stk_tex_manager.hpp"
#include "config/hardware_stats.hpp"
#include "config/user_config.hpp"
#include "graphics/central_settings.hpp"
#include "graphics/materials.hpp"
#include "graphics/threaded_tex_loader.hpp"
@ -33,11 +34,12 @@ STKTexManager::STKTexManager() : m_pbo(0), m_thread_size(0)
#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
    if (CVS->supportsThreadedTextureLoading())
    {
        UserConfigParams::m_hq_mipmap = true;
        pthread_mutex_init(&m_threaded_load_textures_mutex, NULL);
        pthread_cond_init(&m_cond_request, NULL);
        m_thread_size = HardwareStats::getNumProcessors();
        m_thread_size = core::clamp(m_thread_size, 1, 3);
        static const unsigned max_pbo_size = 48 * 1024 * 1024;
        m_thread_size = core::clamp(m_thread_size, 1, 8);
        static const unsigned max_pbo_size = 128 * 1024 * 1024;
        const unsigned each_capacity = max_pbo_size / m_thread_size;
        Log::info("STKTexManager", "%d thread(s) for texture loading,"
            " each capacity %d MB", m_thread_size,
@ -18,14 +18,17 @@
#include "graphics/stk_texture.hpp"
#include "config/user_config.hpp"
#include "graphics/central_settings.hpp"
#include "graphics/hq_mipmap_generator.hpp"
#include "graphics/irr_driver.hpp"
#include "graphics/material.hpp"
#include "graphics/material_manager.hpp"
#include "graphics/materials.hpp"
#include "graphics/stk_tex_manager.hpp"
#include "modes/profile_world.hpp"
#include "utils/log.hpp"
#include "utils/string_utils.hpp"

#include <algorithm>
#include <fstream>
#include <functional>

@ -215,7 +218,7 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
    const unsigned int w = m_size.Width;
    const unsigned int h = m_size.Height;
    unsigned int format = m_single_channel ? GL_RED : GL_BGRA;
    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA;
    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA8;

#if !defined(USE_GLES2)
    if (m_mesh_texture && CVS->isTextureCompressionEnabled())
@ -227,13 +230,41 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
        else
        {
            internal_format =
                m_single_channel ? GL_R8 : m_srgb ? GL_SRGB_ALPHA : GL_RGBA;
                m_single_channel ? GL_R8 : m_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8;
        }
#endif
    if (!useThreadedLoading())
        formatConversion(data, &format, w, h);

    if (!no_upload)
    if (useThreadedLoading())
    {
        if (m_texture_name == 0)
        {
            glGenTextures(1, &m_texture_name);
            glBindTexture(GL_TEXTURE_2D, m_texture_name);
            if (m_single_channel)
            {
                glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_ONE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_RED);
            }
            int levels = 1;
            int width = w;
            int height = h;
            while (true)
            {
                width = width < 2 ? 1 : width >> 1;
                height = height < 2 ? 1 : height >> 1;
                levels++;
                if (width == 1 && height == 1)
                    break;
            }
            glTexStorage2D(GL_TEXTURE_2D, levels, internal_format, w, h);
        }
    }
    else if (!no_upload)
    {
        const bool reload = m_texture_name != 0;
        if (!reload)
@ -253,14 +284,14 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
            glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format,
                GL_UNSIGNED_BYTE, data);
        }
        else if (!useThreadedLoading())
        else
        {
            glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format,
                GL_UNSIGNED_BYTE, data);
        }
        if (orig_img)
            orig_img->unlock();
        if (hasMipMaps() && !useThreadedLoading())
        if (hasMipMaps())
            glGenerateMipmap(GL_TEXTURE_2D);
    }

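The while loop in the hunk above sizes the immutable glTexStorage2D allocation; for any texture with max(w, h) >= 2 it produces 1 + floor(log2(max(w, h))) levels, the full mip chain. A standalone brute-force check of that equivalence (not part of the patch):

/* Re-implements the level-counting loop and compares it against the closed
 * form over a range of sizes. */
#include <assert.h>

static int count_levels( int w, int h )
{
  int levels = 1;
  while( 1 )
  {
    w = w < 2 ? 1 : w >> 1;
    h = h < 2 ? 1 : h >> 1;
    levels++;
    if( ( w == 1 ) && ( h == 1 ) )
      break;
  }
  return levels;
}

int main( void )
{
  int w, h, m, expected;
  for( w = 1 ; w <= 512 ; w++ )
  {
    for( h = 1 ; h <= 512 ; h++ )
    {
      m = ( w > h ? w : h );
      if( m < 2 )
        continue;
      for( expected = 1 ; ( 1 << expected ) <= m ; expected++ );
      /* expected is now 1 + floor(log2(m)) */
      assert( count_levels( w, h ) == expected );
    }
  }
  return 0;
}
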
@ -594,8 +625,15 @@ void STKTexture::threadedReload(void* ptr, void* param) const
    if (orig_img)
    {
        orig_img->unlock();
        orig_img->setDeleteMemory(false);
        orig_img->drop();
    }
    if (useHQMipmap())
    {
        HQMipmapGenerator* hqmg = new HQMipmapGenerator(NamedPath, data,
            m_size, m_texture_name, m_single_channel);
        ((STKTexManager*)(param))->addThreadedLoadTexture(hqmg);
    }
    else
        delete[] data;
}   // threadedReload
@ -607,8 +645,11 @@ void STKTexture::threadedSubImage(void* ptr) const
    glBindTexture(GL_TEXTURE_2D, m_texture_name);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height,
        m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr);
    if (useHQMipmap())
        return;
    if (hasMipMaps())
        glGenerateMipmap(GL_TEXTURE_2D);

#endif
}   // threadedSubImage

@ -620,3 +661,10 @@ void STKTexture::cleanThreadedLoader()
    m_file = NULL;
    m_img_loader = NULL;
}   // cleanThreadedLoader

//-----------------------------------------------------------------------------
bool STKTexture::useHQMipmap() const
{
    return UserConfigParams::m_hq_mipmap && m_size.Width > 1 &&
        m_size.Height > 1;
}   // useHQMipmap
@ -76,6 +76,8 @@ private:
            sc[i] = data[4 * i + 3];
        return sc;
    }
    // ------------------------------------------------------------------------
    bool useHQMipmap() const;

public:
    // ------------------------------------------------------------------------
@ -84,8 +84,9 @@ void ThreadedTexLoader::handleCompletedTextures()
    size_t offset = m_pbo_offset;
    for (irr::video::ITexture* tex : m_completed_textures)
    {
        size_t cur_offset = tex->getTextureSize();
        tex->threadedSubImage((void*)offset);
        offset += tex->getTextureSize();
        offset += cur_offset;
    }
    m_completed_textures.clear();
#endif