diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31ef6f003..8a0a558a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,10 @@ if((WIN32 AND NOT MINGW) OR APPLE)
     set(JPEG_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/jpeglib/")
     set(JPEG_LIBRARY jpeglib)
 endif()
+
+add_subdirectory("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
+include_directories("${PROJECT_SOURCE_DIR}/lib/graphics_utils")
+
 # Build the irrlicht library
 add_subdirectory("${PROJECT_SOURCE_DIR}/lib/irrlicht")
 include_directories("${PROJECT_SOURCE_DIR}/lib/irrlicht/include")
@@ -370,6 +374,7 @@ target_link_libraries(supertuxkart
     bulletmath
     enet
     stkirrlicht
+    graphics_utils
     ${Angelscript_LIBRARIES}
     ${CURL_LIBRARIES}
     ${OGGVORBIS_LIBRARIES}
diff --git a/lib/graphics_utils/CMakeLists.txt b/lib/graphics_utils/CMakeLists.txt
new file mode 100644
index 000000000..fa8efa49e
--- /dev/null
+++ b/lib/graphics_utils/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 2.6)
+if (UNIX OR MINGW)
+    add_definitions(-O3 -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -ffast-math)
+endif()
+add_library(graphics_utils STATIC
+    mipmap/cpusimd.c
+    mipmap/img.c
+    mipmap/imgresize.c
+)
diff --git a/lib/graphics_utils/mipmap/cpusimd.c b/lib/graphics_utils/mipmap/cpusimd.c
new file mode 100644
index 000000000..74e0b8be6
--- /dev/null
+++ b/lib/graphics_utils/mipmap/cpusimd.c
@@ -0,0 +1,568 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2008-2016 Alexis Naveros.
+ *
+ *
+ * The SIMD trigonometry functions are Copyright (C) 2007  Julien Pommier
+ * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
+ *
+ *
+ * Some functions are Copyright (C) 2008  José Fonseca
+ * See copyright notice for simd4f_exp2_ps(), simd4f_log2_ps(), simd4f_pow_ps()
+ *
+ *
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * -----------------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+#include <sys/time.h>
+
+
+#include "cpusimd.h"
+
+
+////
+
+
+#if CPU_SSE_SUPPORT
+
+const uint32_t simd4fSignMask[4] CPU_ALIGN16 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+const uint32_t simd4fSignMaskInv[4] CPU_ALIGN16 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+const float simd4fHalf[4] CPU_ALIGN16 = { 0.5, 0.5, 0.5, 0.5 };
+const float simd4fOne[4] CPU_ALIGN16 = { 1.0, 1.0, 1.0, 1.0 };
+const float simd4fTwo[4] CPU_ALIGN16 = { 2.0, 2.0, 2.0, 2.0 };
+const float simd4fThree[4] CPU_ALIGN16 = { 3.0, 3.0, 3.0, 3.0 };
+const uint32_t simd4uOne[4] CPU_ALIGN16 = { 1, 1, 1, 1 };
+const uint32_t simd4uOneInv[4] CPU_ALIGN16 = { ~1, ~1, ~1, ~1 };
+const uint32_t simd4uTwo[4] CPU_ALIGN16 = { 2, 2, 2, 2 };
+const uint32_t simd4uFour[4] CPU_ALIGN16 = { 4, 4, 4, 4 };
+const float simd4fQuarter[4] CPU_ALIGN16 = { 0.25, 0.25, 0.25, 0.25 };
+const float simd4fPi[4] CPU_ALIGN16 = { M_PI, M_PI, M_PI, M_PI };
+const float simd4fZeroOneTwoThree[4] CPU_ALIGN16 = { 0.0, 1.0, 2.0, 3.0 };
+const uint32_t simd4fAlphaMask[4] CPU_ALIGN16 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff };
+const float simd4f255[4] CPU_ALIGN16 = { 255.0f, 255.0f, 255.0f, 255.0f };
+const float simd4f255Inv[4] CPU_ALIGN16 = { 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f };
+
+#endif
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+static const float simd4f_cephes_FOPI[4] CPU_ALIGN16 = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
+static const float simd4f_minus_cephes_DP1[4] CPU_ALIGN16 = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
+static const float simd4f_minus_cephes_DP2[4] CPU_ALIGN16 = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
+static const float simd4f_minus_cephes_DP3[4] CPU_ALIGN16 = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
+static const float simd4f_sincof_p0[4] CPU_ALIGN16 = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
+static const float simd4f_sincof_p1[4] CPU_ALIGN16 = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
+static const float simd4f_sincof_p2[4] CPU_ALIGN16 = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
+static const float simd4f_coscof_p0[4] CPU_ALIGN16 = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
+static const float simd4f_coscof_p1[4] CPU_ALIGN16 = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
+static const float simd4f_coscof_p2[4] CPU_ALIGN16 = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
+
+__m128 simd4f_sin_ps( __m128 x )
+{
+  __m128 xmm1, xmm2, xmm3, sign_bit, y;
+  __m128i emm0, emm2;
+
+  xmm2 = _mm_setzero_ps();
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps( x, *(__m128 *)simd4fSignMaskInv );
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(__m128 *)simd4fSignMask);
+
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(__m128 *)simd4f_cephes_FOPI);
+
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
+  y = _mm_cvtepi32_ps(emm2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
+  __m128 poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(__m128 *)simd4f_minus_cephes_DP1;
+  xmm2 = *(__m128 *)simd4f_minus_cephes_DP2;
+  xmm3 = *(__m128 *)simd4f_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(__m128 *)simd4f_coscof_p0;
+  __m128 z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128 *)simd4f_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  __m128 tmp = _mm_mul_ps(z, *(__m128 *)simd4fHalf);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(__m128 *)simd4fOne);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  __m128 y2 = *(__m128 *)simd4f_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128 *)simd4f_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+
+
+/* almost the same as sin_ps */
+__m128 simd4f_cos_ps( __m128 x )
+{
+  __m128 xmm1, xmm2, xmm3, y;
+  __m128i emm0, emm2;
+
+  xmm2 = _mm_setzero_ps();
+
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);
+  
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(__m128i*)simd4uTwo);
+  
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, *(__m128i*)simd4uFour);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  __m128 sign_bit = _mm_castsi128_ps(emm0);
+  __m128 poly_mask = _mm_castsi128_ps(emm2);
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
+  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
+  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(__m128*)simd4f_coscof_p0;
+  __m128 z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(__m128*)simd4fOne);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  __m128 y2 = *(__m128*)simd4f_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine */
+void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c )
+{
+  __m128 xmm1, xmm2, xmm3, sign_bit_sin, y;
+  __m128i emm0, emm2, emm4;
+
+  xmm3 = _mm_setzero_ps();
+
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(__m128*)simd4fSignMaskInv);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)simd4fSignMask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(__m128*)simd4f_cephes_FOPI);
+    
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(__m128i*)simd4uOne);
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uOneInv);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(__m128i*)simd4uFour);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynom selection mask for the sine*/
+  emm2 = _mm_and_si128(emm2, *(__m128i*)simd4uTwo);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  __m128 poly_mask = _mm_castsi128_ps(emm2);
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(__m128*)simd4f_minus_cephes_DP1;
+  xmm2 = *(__m128*)simd4f_minus_cephes_DP2;
+  xmm3 = *(__m128*)simd4f_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  emm4 = _mm_sub_epi32(emm4, *(__m128i*)simd4uTwo);
+  emm4 = _mm_andnot_si128(emm4, *(__m128i*)simd4uFour);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  __m128 z = _mm_mul_ps(x,x);
+  y = *(__m128*)simd4f_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(__m128*)simd4f_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  __m128 tmp = _mm_mul_ps(z, *(__m128*)simd4fHalf);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(__m128*)simd4fOne);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  __m128 y2 = *(__m128*)simd4f_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(__m128*)simd4f_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  __m128 ysin2 = _mm_and_ps(xmm3, y2);
+  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
+
+#endif
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+
+/* Copyright (C) 2008  José Fonseca
+   http://jrfonseca.blogspot.ca/2008/09/fast-sse2-pow-tables-or-polynomials.html
+   MIT license
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#define POLY0(x,c0) _mm_set1_ps(c0)
+#define POLY1(x,c0,c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x,c0,c1,c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x,c0,c1,c2,c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x,c0,c1,c2,c3,c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x,c0,c1,c2,c3,c4,c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+__m128 simd4f_exp2_ps( __m128 x )
+{
+  __m128i ipart;
+  __m128 fpart, expipart, expfpart;
+
+  x = _mm_min_ps( x, _mm_set1_ps(  129.00000f ) );
+  x = _mm_max_ps( x, _mm_set1_ps( -126.99999f ) );
+  /* ipart = int(x - 0.5) */
+  ipart = _mm_cvtps_epi32( _mm_sub_ps( x, _mm_set1_ps( 0.5f ) ) );
+  /* fpart = x - ipart */
+  fpart = _mm_sub_ps( x, _mm_cvtepi32_ps( ipart ) );
+  /* expipart = (float) (1 << ipart) */
+  expipart = _mm_castsi128_ps( _mm_slli_epi32( _mm_add_epi32( ipart, _mm_set1_epi32( 127 ) ), 23 ) );
+  /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+  expfpart = POLY5( fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f );
+#elif EXP_POLY_DEGREE == 4
+  expfpart = POLY4( fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f );
+#elif EXP_POLY_DEGREE == 3
+  expfpart = POLY3( fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f );
+#elif EXP_POLY_DEGREE == 2
+  expfpart = POLY2( fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f );
+#else
+#error
+#endif
+  return _mm_mul_ps(expipart, expfpart);
+}
+
+__m128 simd4f_log2_ps( __m128 x )
+{
+  __m128i expmask, mantmask, i;
+  __m128 one, vexp, mant, logmant;
+
+  expmask = _mm_set1_epi32( 0x7f800000 );
+  mantmask = _mm_set1_epi32( 0x007fffff );
+  one = _mm_set1_ps( 1.0f );
+  i = _mm_castps_si128( x );
+  /* exp = (float) exponent(x) */
+  vexp = _mm_cvtepi32_ps( _mm_sub_epi32( _mm_srli_epi32( _mm_and_si128( i, expmask ), 23 ), _mm_set1_epi32( 127 ) ) );
+  /* mant = (float) mantissa(x) */
+  mant = _mm_or_ps( _mm_castsi128_ps( _mm_and_si128( i, mantmask ) ), one );
+  /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+   * These coefficients can be generate with 
+   * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+   */
+#if LOG_POLY_DEGREE == 6
+  logmant = POLY5( mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f );
+#elif LOG_POLY_DEGREE == 5
+  logmant = POLY4( mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f );
+#elif LOG_POLY_DEGREE == 4
+  logmant = POLY3( mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f );
+#elif LOG_POLY_DEGREE == 3
+  logmant = POLY2( mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f );
+#else
+#error
+#endif
+  /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+  logmant = _mm_mul_ps( logmant, _mm_sub_ps(mant, one ) );
+  return _mm_add_ps( logmant, vexp );
+}
+
+
+__m128 simd4f_pow_ps( __m128 x, __m128 y )
+{
+  return simd4f_exp2_ps( _mm_mul_ps( simd4f_log2_ps( x ), y ) );
+}
+
+
+#endif
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+
+/*
+By Potatoswatter
+http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent
+*/
+
+#ifndef CC_ALWAYSINLINE
+ #if defined(__GNUC__) || defined(__INTEL_COMPILER)
+  #define CC_ALWAYSINLINE __attribute__((always_inline))
+ #else
+  #define CC_ALWAYSINLINE
+ #endif
+#endif
+
+static inline CC_ALWAYSINLINE __m128 simd4f_fastpow_ps( __m128 arg, uint32_t expnum, uint32_t expden, uint32_t coeffnum, uint32_t coeffden )
+{
+  __m128 ret = arg;
+  float corrfactor, powfactor;
+  /* Apply a constant pre-correction factor. */
+  corrfactor = exp2( 127.0 * expden / expnum - 127.0 ) * pow( 1.0 * coeffnum / coeffden, 1.0 * expden / expnum );
+  powfactor = 1.0 * expnum / expden;
+  ret = _mm_mul_ps( ret, _mm_set1_ps( corrfactor ) );
+  /* Reinterpret arg as integer to obtain logarithm. */
+  ret = _mm_cvtepi32_ps( _mm_castps_si128( ret ) );
+  /* Multiply logarithm by power. */
+  ret = _mm_mul_ps( ret, _mm_set1_ps( powfactor ) );
+  /* Convert back to "integer" to exponentiate. */
+  ret = _mm_castsi128_ps( _mm_cvtps_epi32( ret ) );
+  return ret;
+}
+
+__m128 simd4f_pow12d5_ps( __m128 arg )
+{
+  /* Lower exponents provide lower initial error, but too low causes overflow. */
+  __m128 xf = simd4f_fastpow_ps( arg, 4, 5, (int)( 1.38316186f * 1e9 ), (int)1e9 );
+  /* Imprecise 4-cycle sqrt is still far better than fastpow, good enough. */
+  __m128 xfm4 = _mm_rsqrt_ps( xf );
+  __m128 xf4 = _mm_mul_ps( xf, xfm4 );
+  /* Precisely calculate x^2 and x^3 */
+  __m128 x2 = _mm_mul_ps( arg, arg );
+  __m128 x3 = _mm_mul_ps( x2, arg );
+  /* Overestimate of x^2 * x^0.4 */
+  x2 = _mm_mul_ps( x2, xf4 );
+  /* Get x^-0.2 from x^0.4, and square it for x^-0.4. Combine into x^-0.6. */
+  __m128 xfm2 = _mm_rsqrt_ps( xf4 );
+  x3 = _mm_mul_ps( x3, xfm4 );
+  x3 = _mm_mul_ps( x3, xfm2 );
+  return _mm_mul_ps( _mm_add_ps( x2, x3 ), _mm_set1_ps( 1.0f/1.960131704207789f * 0.9999f ) );
+}
+
+__m128 simd4f_pow5d12_ps( __m128 arg )
+{
+  /* 5/12 is too small, so compute the 4th root of 20/12 instead. */
+  /* 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. */
+  /* weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 */
+  __m128 xf = simd4f_fastpow_ps( arg, 2, 3, (int)( 0.629960524947437f * 1e9 ), (int)1e9 );
+  __m128 xover = _mm_mul_ps( arg, xf );
+  __m128 xfm1 = _mm_rsqrt_ps( xf );
+  __m128 x2 = _mm_mul_ps( arg, arg );
+  __m128 xunder = _mm_mul_ps( x2, xfm1 );
+  /* sqrt2 * over + 2 * sqrt2 * under */
+  __m128 xavg = _mm_mul_ps( _mm_set1_ps( 1.0f/( 3.0f * 0.629960524947437f ) * 0.999852f ), _mm_add_ps( xover, xunder ) );
+  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
+  xavg = _mm_mul_ps( xavg, _mm_rsqrt_ps( xavg ) );
+  return xavg;
+}
+
+#endif
+
+
+////
+
diff --git a/lib/graphics_utils/mipmap/cpusimd.h b/lib/graphics_utils/mipmap/cpusimd.h
new file mode 100644
index 000000000..4c0b0f582
--- /dev/null
+++ b/lib/graphics_utils/mipmap/cpusimd.h
@@ -0,0 +1,410 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2008-2016 Alexis Naveros.
+ *
+ * The SIMD trigonometry functions are Copyright (C) 2007  Julien Pommier
+ * See copyright notice for simd4f_sin_ps(), simd4f_cos_ps(), simd4f_sincos_ps()
+ *
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * -----------------------------------------------------------------------------
+ */
+
+
+#ifndef CPUSIMD_H
+#define CPUSIMD_H
+
+
+////
+
+
+#if __MMX__ || CPU_ENABLE_MMX
+ #include <mmintrin.h>
+ #define CPU_MMX_SUPPORT (1)
+#endif
+#if __SSE__ || _M_X64 || _M_IX86_FP >= 1  || CPU_ENABLE_SSE
+ #include <xmmintrin.h>
+ #define CPU_SSE_SUPPORT (1)
+#endif
+#if __SSE2__ || _M_X64 || _M_IX86_FP >= 2  || CPU_ENABLE_SSE2
+ #include <emmintrin.h>
+ #define CPU_SSE2_SUPPORT (1)
+#endif
+#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
+ #include <pmmintrin.h>
+ #define CPU_SSE3_SUPPORT (1)
+#endif
+#if __SSSE3__ || __AVX__  || CPU_ENABLE_SSSE3
+ #include <tmmintrin.h>
+ #define CPU_SSSE3_SUPPORT (1)
+#endif
+#if __SSE4_1__ || __AVX__  || CPU_ENABLE_SSE4_1
+ #include <smmintrin.h>
+ #define CPU_SSE4_1_SUPPORT (1)
+#endif
+#if __SSE4_2__ || CPU_ENABLE_SSE4_2
+ #include <nmmintrin.h>
+ #define CPU_SSE4_2_SUPPORT (1)
+#endif
+#if __SSE4A__ || CPU_ENABLE_SSE4A
+ #include <ammintrin.h>
+ #define CPU_SSE4A_SUPPORT (1)
+#endif
+#if __AVX__ || CPU_ENABLE_AVX
+ #include <immintrin.h>
+ #define CPU_AVX_SUPPORT (1)
+#endif
+#if __AVX2__ || CPU_ENABLE_AVX2
+ #include <immintrin.h>
+ #define CPU_AVX2_SUPPORT (1)
+#endif
+#if __XOP__ || CPU_ENABLE_XOP
+ #include <immintrin.h>
+ #define CPU_XOP_SUPPORT (1)
+#endif
+#if __FMA3__ || CPU_ENABLE_FMA3
+ #include <immintrin.h>
+ #define CPU_FMA3_SUPPORT (1)
+#endif
+#if __FMA4__ || CPU_ENABLE_FMA4
+ #include <immintrin.h>
+ #define CPU_FMA4_SUPPORT (1)
+#endif
+#if __RDRND__ || CPU_ENABLE_RDRND
+ #include <immintrin.h>
+ #define CPU_RDRND_SUPPORT (1)
+#endif
+#if __POPCNT__ || CPU_ENABLE_POPCNT
+ #include <popcntintrin.h>
+ #define CPU_POPCNT_SUPPORT (1)
+#endif
+#if __LZCNT__ || CPU_ENABLE_LZCNT
+ #include <lzcntintrin.h>
+ #define CPU_LZCNT_SUPPORT (1)
+#endif
+#if __F16C__ || CPU_ENABLE_F16C
+ #include <f16cintrin.h>
+ #define CPU_F16C_SUPPORT (1)
+#endif
+#if __BMI__ || CPU_ENABLE_BMI
+ #include <bmiintrin.h>
+ #define CPU_BMI_SUPPORT (1)
+#endif
+#if __BMI2__ || CPU_ENABLE_BMI2
+ #include <bmi2intrin.h>
+ #define CPU_BMI2_SUPPORT (1)
+#endif
+#if __TBM__ || CPU_ENABLE_TBM
+ #include <tbmintrin.h>
+ #define CPU_TBM_SUPPORT (1)
+#endif
+
+
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+ #define CPU_ALIGN16 __attribute__((aligned(16)))
+ #define CPU_ALIGN32 __attribute__((aligned(32)))
+ #define CPU_ALIGN64 __attribute__((aligned(64)))
+#elif defined(_MSC_VER)
+ #define CPU_ALIGN16 __declspec(align(16))
+ #define CPU_ALIGN64 __declspec(align(64))
+#else
+ #define CPU_ALIGN16
+ #define CPU_ALIGN32
+ #define CPU_ALIGN64
+ #warning "SSE/AVX Disabled: Unsupported Compiler."
+ #undef CPU_SSE_SUPPORT
+ #undef CPU_SSE2_SUPPORT
+ #undef CPU_SSE3_SUPPORT
+ #undef CPU_SSSE3_SUPPORT
+ #undef CPU_SSE4_1_SUPPORT
+ #undef CPU_SSE4_2_SUPPORT
+ #undef CPU_AVX_SUPPORT
+ #undef CPU_AVX2_SUPPORT
+ #undef CPU_XOP_SUPPORT
+ #undef CPU_FMA3_SUPPORT
+ #undef CPU_FMA4_SUPPORT
+#endif
+
+
+////
+
+
+#if CPU_SSE_SUPPORT
+ #define CPU_APPROX_DIV_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rcp_ss(_mm_set_ss(w))))
+ #define CPU_APPROX_SQRT_FLOAT(z) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(z))))
+ #define CPU_APPROX_RSQRT_FLOAT(z) _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(z)))
+ #define CPU_APPROX_DIVSQRT_FLOAT(z,w) _mm_cvtss_f32(_mm_mul_ss(_mm_set_ss(z),_mm_rsqrt_ss(_mm_set_ss(w))))
+#else
+ #define CPU_APPROX_DIV_FLOAT(z,w) ((z)/(w))
+ #define CPU_APPROX_SQRT_FLOAT(z) (sqrtf(z))
+ #define CPU_APPROX_RSQRT_FLOAT(z) (1.0/sqrtf(z))
+ #define CPU_APPROX_DIVSQRT_FLOAT(z,w) ((z)/sqrtf(w))
+#endif
+
+
+#if CPU_SSE3_SUPPORT
+ #define CPU_HADD_PS(vx,vy) _mm_hadd_ps(vx,vy)
+ #define CPU_HADD_PD(vx,vy) _mm_hadd_pd(vx,vy)
+#elif CPU_SSE_SUPPORT
+ static inline __m128 CPU_HADD_PS( __m128 vx, __m128 vy )
+ {
+   __m128 vh, vl;
+   vh = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(3,1,3,1) );
+   vl = _mm_shuffle_ps( vx, vy, _MM_SHUFFLE(2,0,2,0) );
+   return _mm_add_ps( vh, vl );
+ }
+ #define CPU_HADD_PD(vx,vy) _mm_add_sd(vx,_mm_unpackhi_pd(vy,vy))
+#endif
+
+
+#if CPU_SSE4_1_SUPPORT
+ #define CPU_CVT_U8_TO_I32(x,vzero) _mm_cvtepu8_epi32(x)
+ #define CPU_CVT_S8_TO_I32(x,vzero) _mm_cvtepi8_epi32(x)
+#elif CPU_SSE2_SUPPORT
+ #define CPU_CVT_U8_TO_I32(x,vzero) _mm_unpacklo_epi16(_mm_unpacklo_epi8((x),(vzero)),(vzero))
+static inline __m128i CPU_CVT_S8_TO_I32( __m128i vx, __m128i vzero )
+{
+  __m128i vsign;
+  vsign = _mm_cmpgt_epi8( vzero, vx );
+  return _mm_unpacklo_epi16( _mm_unpacklo_epi8( vx, vsign ), _mm_unpacklo_epi8( vsign, vsign ) );
+}
+#endif
+
+
+#if CPU_SSE4_1_SUPPORT
+ #define CPU_BLENDV_PS(x,y,mask) _mm_blendv_ps(x,y,mask)
+ #define CPU_BLENDV_PD(x,y,mask) _mm_blendv_pd(x,y,mask)
+#elif CPU_SSE2_SUPPORT
+ #define CPU_BLENDV_PS(x,y,mask) _mm_or_ps(_mm_andnot_ps(mask,x),_mm_and_ps(y,mask))
+ #define CPU_BLENDV_PD(x,y,mask) _mm_or_pd(_mm_andnot_pd(mask,x),_mm_and_pd(y,mask))
+#endif
+
+
+
+/*
+CPU_FMADD = ((f0*f1)+t0)
+CPU_FMSUB = ((f0*f1)-t0)
+*/
+#if CPU_FMA3_SUPPORT
+ #define CPU_FMADD_SS(f0,f1,t0) _mm_fmadd_ss(f0,f1,t0)
+ #define CPU_FMADD_PS(f0,f1,t0) _mm_fmadd_ps(f0,f1,t0)
+ #define CPU_FMADD_SD(f0,f1,t0) _mm_fmadd_sd(f0,f1,t0)
+ #define CPU_FMADD_PD(f0,f1,t0) _mm_fmadd_pd(f0,f1,t0)
+ #define CPU_FMSUB_SS(f0,f1,t0) _mm_fmsub_ss(f0,f1,t0)
+ #define CPU_FMSUB_PS(f0,f1,t0) _mm_fmsub_ps(f0,f1,t0)
+ #define CPU_FMSUB_SD(f0,f1,t0) _mm_fmsub_sd(f0,f1,t0)
+ #define CPU_FMSUB_PD(f0,f1,t0) _mm_fmsub_pd(f0,f1,t0)
+ #define CPU_FMADD256_SS(f0,f1,t0) _mm256_fmadd_ss(f0,f1,t0)
+ #define CPU_FMADD256_PS(f0,f1,t0) _mm256_fmadd_ps(f0,f1,t0)
+ #define CPU_FMADD256_SD(f0,f1,t0) _mm256_fmadd_sd(f0,f1,t0)
+ #define CPU_FMADD256_PD(f0,f1,t0) _mm256_fmadd_pd(f0,f1,t0)
+ #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_fmsub_ss(f0,f1,t0)
+ #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_fmsub_ps(f0,f1,t0)
+ #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_fmsub_sd(f0,f1,t0)
+ #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_fmsub_pd(f0,f1,t0)
+#elif CPU_FMA4_SUPPORT
+ #define CPU_FMADD_SS(f0,f1,t0) _mm_macc_ss(f0,f1,t0)
+ #define CPU_FMADD_PS(f0,f1,t0) _mm_macc_ps(f0,f1,t0)
+ #define CPU_FMADD_SD(f0,f1,t0) _mm_macc_sd(f0,f1,t0)
+ #define CPU_FMADD_PD(f0,f1,t0) _mm_macc_pd(f0,f1,t0)
+ #define CPU_FMSUB_SS(f0,f1,t0) _mm_msub_ss(f0,f1,t0)
+ #define CPU_FMSUB_PS(f0,f1,t0) _mm_msub_ps(f0,f1,t0)
+ #define CPU_FMSUB_SD(f0,f1,t0) _mm_msub_sd(f0,f1,t0)
+ #define CPU_FMSUB_PD(f0,f1,t0) _mm_msub_pd(f0,f1,t0)
+ #define CPU_FMADD256_SS(f0,f1,t0) _mm256_macc_ss(f0,f1,t0)
+ #define CPU_FMADD256_PS(f0,f1,t0) _mm256_macc_ps(f0,f1,t0)
+ #define CPU_FMADD256_SD(f0,f1,t0) _mm256_macc_sd(f0,f1,t0)
+ #define CPU_FMADD256_PD(f0,f1,t0) _mm256_macc_pd(f0,f1,t0)
+ #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_msub_ss(f0,f1,t0)
+ #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_msub_ps(f0,f1,t0)
+ #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_msub_sd(f0,f1,t0)
+ #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_msub_pd(f0,f1,t0)
+#else
+ #define CPU_FMADD_SS(f0,f1,t0) _mm_add_ss(_mm_mul_ss(f0,f1),t0)
+ #define CPU_FMADD_PS(f0,f1,t0) _mm_add_ps(_mm_mul_ps(f0,f1),t0)
+ #define CPU_FMADD_SD(f0,f1,t0) _mm_add_sd(_mm_mul_sd(f0,f1),t0)
+ #define CPU_FMADD_PD(f0,f1,t0) _mm_add_pd(_mm_mul_pd(f0,f1),t0)
+ #define CPU_FMSUB_SS(f0,f1,t0) _mm_sub_ss(_mm_mul_ss(f0,f1),t0)
+ #define CPU_FMSUB_PS(f0,f1,t0) _mm_sub_ps(_mm_mul_ps(f0,f1),t0)
+ #define CPU_FMSUB_SD(f0,f1,t0) _mm_sub_sd(_mm_mul_sd(f0,f1),t0)
+ #define CPU_FMSUB_PD(f0,f1,t0) _mm_sub_pd(_mm_mul_pd(f0,f1),t0)
+ #define CPU_FMADD256_SS(f0,f1,t0) _mm256_add_ss(_mm256_mul_ss(f0,f1),t0)
+ #define CPU_FMADD256_PS(f0,f1,t0) _mm256_add_ps(_mm256_mul_ps(f0,f1),t0)
+ #define CPU_FMADD256_SD(f0,f1,t0) _mm256_add_sd(_mm256_mul_sd(f0,f1),t0)
+ #define CPU_FMADD256_PD(f0,f1,t0) _mm256_add_pd(_mm256_mul_pd(f0,f1),t0)
+ #define CPU_FMSUB256_SS(f0,f1,t0) _mm256_sub_ss(_mm256_mul_ss(f0,f1),t0)
+ #define CPU_FMSUB256_PS(f0,f1,t0) _mm256_sub_ps(_mm256_mul_ps(f0,f1),t0)
+ #define CPU_FMSUB256_SD(f0,f1,t0) _mm256_sub_sd(_mm256_mul_sd(f0,f1),t0)
+ #define CPU_FMSUB256_PD(f0,f1,t0) _mm256_sub_pd(_mm256_mul_pd(f0,f1),t0)
+#endif
+
+
+////
+
+
+#if CPU_SSE_SUPPORT
+
+extern const uint32_t simd4fSignMask[4];
+extern const uint32_t simd4fSignMaskInv[4];
+extern const float simd4fHalf[4];
+extern const float simd4fOne[4];
+extern const float simd4fTwo[4];
+extern const float simd4fThree[4];
+extern const uint32_t simd4uOne[4];
+extern const uint32_t simd4uOneInv[4];
+extern const uint32_t simd4uTwo[4];
+extern const uint32_t simd4uFour[4];
+extern const float simd4fQuarter[4];
+extern const float simd4fPi[4];
+extern const float simd4fZeroOneTwoThree[4];
+extern const uint32_t simd4fAlphaMask[4];
+extern const float simd4f255[4];
+extern const float simd4f255Inv[4];
+
+#endif
+
+
+#if CPU_SSE2_SUPPORT
+
+/* Input range between -8192 and 8192 */
+__m128 simd4f_sin_ps( __m128 x );
+__m128 simd4f_cos_ps( __m128 x );
+void simd4f_sincos_ps( __m128 x, __m128 *s, __m128 *c );
+
+#endif
+
+#if CPU_SSE2_SUPPORT
+
+__m128 simd4f_exp2_ps( __m128 x );
+__m128 simd4f_log2_ps( __m128 x );
+__m128 simd4f_pow_ps( __m128 x, __m128 y );
+
+#endif
+
+#if CPU_SSE2_SUPPORT
+
+__m128 simd4f_pow12d5_ps( __m128 arg );
+__m128 simd4f_pow5d12_ps( __m128 arg );
+
+#endif
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+#ifndef CC_ALWAYSINLINE
+ #if defined(__GNUC__) || defined(__INTEL_COMPILER)
+  #define CC_ALWAYSINLINE __attribute__((always_inline))
+ #else
+  #define CC_ALWAYSINLINE
+ #endif
+#endif
+
+static inline CC_ALWAYSINLINE __m128 simd4f_pow12d5_inline_ps( __m128 vx )
+{
+  __m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
+  vx2 = _mm_mul_ps( vx, vx );
+  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 5417434112.0f ) ) ) ), _mm_set1_ps( 0.8f ) ) ) );
+  vpwsqrtinv = _mm_rsqrt_ps( vpow );
+  vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
+  return _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), _mm_set1_ps( 0.51011878327f ) );
+}
+
+static inline CC_ALWAYSINLINE __m128 simd4f_pow5d12_inline_ps( __m128 vx )
+{
+  __m128 vpow;
+  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, _mm_set1_ps( 6521909350804488192.0f ) ) ) ), _mm_set1_ps( 0.666666666666f ) ) ) );
+  vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), _mm_set1_ps( 0.5290553722f ) );
+#if 0
+  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
+  vx = _mm_mul_ps( vx, _mm_rsqrt_ps( vx ) );
+#else
+  vx = _mm_sqrt_ps( vx );
+  vx = _mm_sqrt_ps( vx );
+#endif
+  return vx;
+}
+
+#endif
+
+
+////
+
+
+#if CPU_SSE_SUPPORT
+
+static inline void simdPrintDebugSSE4f( char *str, __m128 v )
+{
+  float CPU_ALIGN16 store[4];
+  _mm_store_ps( (void *)store, v );
+  printf( "%s %f %f %f %f\n", str, store[0], store[1], store[2], store[3] );
+  return;
+}
+
+static inline void simdPrintDebugSSE2d( char *str, __m128d v )
+{
+  double CPU_ALIGN16 store[2];
+  _mm_store_pd( (void *)store, v );
+  printf( "%s %f %f\n", str, store[0], store[1] );
+  return;
+}
+
+static inline void simdPrintDebugSSE16u8( char *str, __m128i v )
+{
+  uint8_t CPU_ALIGN16 store[16];
+  _mm_store_si128( (void *)store, v );
+  printf( "%s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7], store[8], store[9], store[10], store[11], store[12], store[13], store[14], store[15] );
+  return;
+}
+
+static inline void simdPrintDebugSSE8u16( char *str, __m128i v )
+{
+  uint16_t CPU_ALIGN16 store[8];
+  _mm_store_si128( (void *)store, v );
+  printf( "%s %d %d %d %d %d %d %d %d\n", str, store[0], store[1], store[2], store[3], store[4], store[5], store[6], store[7] );
+  return;
+}
+
+static inline void simdPrintDebugSSE4u32( char *str, __m128i v )
+{
+  uint32_t CPU_ALIGN16 store[4];
+  _mm_store_si128( (void *)store, v );
+  printf( "%s %d %d %d %d\n", str, store[0], store[1], store[2], store[3] );
+  return;
+}
+
+static inline void simdPrintDebugSSE2u64( char *str, __m128i v )
+{
+  uint64_t CPU_ALIGN16 store[2];
+  _mm_store_si128( (void *)store, v );
+  printf( "%s %lld %lld\n", str, (long long)store[0], (long long)store[1] );
+  return;
+}
+
+#endif
+
+
+////
+
+
+#endif
+
diff --git a/lib/graphics_utils/mipmap/img.c b/lib/graphics_utils/mipmap/img.c
new file mode 100644
index 000000000..369e21f16
--- /dev/null
+++ b/lib/graphics_utils/mipmap/img.c
@@ -0,0 +1,628 @@
+/* *****************************************************************************
+ *
+ * Copyright (c) 2007-2016 Alexis Naveros.
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this file; see the file named COPYING for more
+ * information.
+ *
+ * *****************************************************************************
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+
+#include "cpusimd.h"
+
+#include "img.h"
+
+
+#ifndef ADDRESS
+ #define ADDRESS(p,o) ((void *)(((char *)p)+(o)))
+#endif
+
+
+////
+
+
+void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey )
+{
+  int y;
+  void *dst, *src;
+  src = ADDRESS( image->data, ( srcx * image->format.bytesperpixel ) + ( srcy * image->format.bytesperline ) );
+  dst = ADDRESS( image->data, ( dstx * image->format.bytesperpixel ) + ( dsty * image->format.bytesperline ) );
+  for( y = 0 ; y < sizey ; y++ )
+  {
+    memcpy( dst, src, sizex * image->format.bytesperpixel );
+    src = ADDRESS( src, image->format.bytesperline );
+    dst = ADDRESS( dst, image->format.bytesperline );
+  }
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+static const uint16_t CPU_ALIGN16 imgBlendRgbMask[8] = { 0xffff, 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000 };
+static const uint8_t CPU_ALIGN16 imgBlendAlphaTestMask[16] = { 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff };
+static const uint16_t CPU_ALIGN16 imgBlendRoundBias[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };
+ #if CPU_SSSE3_SUPPORT
+static const uint8_t CPU_ALIGN16 imgBlendShufMask[16] = { 6,7,6,7,6,7,6,7, 14,15,14,15,14,15,14,15 };
+ #endif
+#endif
+
+static void imgBlendImageRgba2Rgba( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
+{
+  int x, y;
+#if CPU_SSE2_SUPPORT
+  int row4size;
+  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
+  __m128i vzero, v255, vrgbmask, valphatest, vroundbias;
+ #if CPU_SSSE3_SUPPORT
+  __m128i vshufmask;
+ #endif
+#else
+  int32_t dstr, dstg, dstb, dsta;
+  int32_t srcr, srcg, srcb, srca;
+#endif
+  unsigned char *src, *srcrow, *dstrow;
+  uint32_t *dst;
+
+  /* TODO: Other function to clamp copy area? */
+
+#if CPU_SSE2_SUPPORT
+  row4size = srcimage->format.width & ~3;
+  vzero = _mm_setzero_si128();
+  v255 = _mm_set1_epi16( 255 );
+  vrgbmask = _mm_load_si128( (void *)imgBlendRgbMask );
+  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
+  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
+ #if CPU_SSSE3_SUPPORT
+  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
+ #endif
+#endif
+
+  src = srcimage->data;
+  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
+  for( y = 0 ; y < srcimage->format.height ; y++ )
+  {
+    srcrow = src;
+    dstrow = (unsigned char *)dst;
+
+#if CPU_SSE2_SUPPORT
+    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
+    {
+      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
+      vsrc23 = _mm_loadu_si128( (void *)srcrow );
+      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
+        continue;
+      vdst23 = _mm_loadu_si128( (void *)dstrow );
+      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
+      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
+      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
+      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
+      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
+      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
+ #if CPU_SSSE3_SUPPORT
+      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
+      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
+      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
+      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
+ #else
+      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
+      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
+      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
+      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
+ #endif
+      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
+      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, _mm_and_si128( vblend23, vrgbmask ) ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
+      /* Correction to divide by 255 instead of 256 */
+      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
+      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
+      /* Combine interleaved and store */
+      _mm_storeu_si128( (void *)dstrow, _mm_packus_epi16( vdst01, vdst23 ) );
+    }
+    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
+    {
+      if( !( srcrow[3] ) )
+        continue;
+      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
+      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
+      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
+      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
+ #if CPU_SSSE3_SUPPORT
+      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
+ #else
+      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
+      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
+ #endif
+      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, _mm_and_si128( vblend01, vrgbmask ) ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
+      /* Correction to divide by 255 instead of 256 */
+      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
+      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_packus_epi16( vdst01, vdst01 ) ) );
+    }
+#else
+    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
+    {
+      if( !( srcrow[3] ) )
+        continue;
+      srcr = (int32_t)srcrow[0];
+      srcg = (int32_t)srcrow[1];
+      srcb = (int32_t)srcrow[2];
+      srca = (int32_t)srcrow[3];
+      dstr = (int32_t)dstrow[0];
+      dstg = (int32_t)dstrow[1];
+      dstb = (int32_t)dstrow[2];
+      dsta = (int32_t)dstrow[3];
+      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
+      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
+      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
+      dsta = ( ( dsta << 8 ) - dsta + ( srca * srca ) + 128 );
+      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
+      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
+      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
+      dsta = ( dsta + ( dsta >> 8 ) ) >> 8;
+      if( dsta > 255 )
+        dsta = 255;
+      dstrow[0] = (unsigned char)dstr;
+      dstrow[1] = (unsigned char)dstg;
+      dstrow[2] = (unsigned char)dstb;
+      dstrow[3] = (unsigned char)dsta;
+    }
+#endif
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+static void imgBlendImageRgba2Rgbx( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
+{
+  int x, y;
+#if CPU_SSE2_SUPPORT
+  int row4size;
+  __m128i vsrc01, vsrc23, vdst01, vdst23, vblend01, vblend23;
+  __m128i vzero, v255, valphatest, vroundbias;
+ #if CPU_SSSE3_SUPPORT
+  __m128i vshufmask;
+ #endif
+#else
+  int32_t dstr, dstg, dstb;
+  int32_t srcr, srcg, srcb, srca;
+#endif
+  unsigned char *src, *srcrow, *dstrow;
+  uint32_t *dst;
+
+  /* TODO: Other function to clamp copy area? */
+
+#if CPU_SSE2_SUPPORT
+  row4size = srcimage->format.width & ~3;
+  vzero = _mm_setzero_si128();
+  v255 = _mm_set1_epi16( 255 );
+  valphatest = _mm_load_si128( (void *)imgBlendAlphaTestMask );
+  vroundbias = _mm_load_si128( (void *)imgBlendRoundBias );
+ #if CPU_SSSE3_SUPPORT
+  vshufmask = _mm_load_si128( (void *)imgBlendShufMask );
+ #endif
+#endif
+
+  src = srcimage->data;
+  dst = ADDRESS( dstimage->data, ( dstx * 4 ) + ( dsty * dstimage->format.bytesperline ) );
+  for( y = 0 ; y < srcimage->format.height ; y++ )
+  {
+    srcrow = src;
+    dstrow = (unsigned char *)dst;
+
+#if CPU_SSE2_SUPPORT
+    for( x = 0 ; x < row4size ; x += 4, srcrow += 16, dstrow += 16 )
+    {
+      /* r0g0b0a0,r1g1b1a1,r2g2b2a2,r3g3b3a3 */
+      vsrc23 = _mm_loadu_si128( (void *)srcrow );
+      if( _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128( valphatest, vsrc23 ), vzero ) ) ) == 0xf )
+        continue;
+      vdst23 = _mm_loadu_si128( (void *)dstrow );
+      /* r0__g0__b0__a0__, r1__g1__b1__a1__ */
+      vsrc01 = _mm_unpacklo_epi8( vsrc23, vzero );
+      vdst01 = _mm_unpacklo_epi8( vdst23, vzero );
+      /* r2__g2__b2__a2__, r3__g3__b3__a3__ */
+      vsrc23 = _mm_unpackhi_epi8( vsrc23, vzero );
+      vdst23 = _mm_unpackhi_epi8( vdst23, vzero );
+ #if CPU_SSSE3_SUPPORT
+      /* __a0__a0__a0__a0, __a1__a1__a1__a1 */
+      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
+      /* __a2__a2__a2__a2, __a3__a3__a3__a3 */
+      vblend23 = _mm_shuffle_epi8( vsrc23, vshufmask );
+ #else
+      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
+      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
+      vblend23 = _mm_shufflelo_epi16( vsrc23, 0xff );
+      vblend23 = _mm_shufflehi_epi16( vblend23, 0xff );
+ #endif
+      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
+      vdst23 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst23, _mm_sub_epi16( v255, vblend23 ) ), _mm_mullo_epi16( vsrc23, vblend23 ) ), vroundbias );
+      /* Correction to divide by 255 instead of 256 */
+      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
+      vdst23 = _mm_srli_epi16( _mm_adds_epu16( vdst23, _mm_srli_epi16( vdst23, 8 ) ), 8 );
+      /* Combine interleaved and store */
+      _mm_storeu_si128( (void *)dstrow, _mm_or_si128( _mm_packus_epi16( vdst01, vdst23 ), valphatest ) );
+    }
+    for( ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
+    {
+      if( !( srcrow[3] ) )
+        continue;
+      vsrc01 = _mm_castps_si128( _mm_load_ss( (void *)srcrow ) );
+      vdst01 = _mm_castps_si128( _mm_load_ss( (void *)dstrow ) );
+      vsrc01 = _mm_unpacklo_epi8( vsrc01, vzero );
+      vdst01 = _mm_unpacklo_epi8( vdst01, vzero );
+ #if CPU_SSSE3_SUPPORT
+      vblend01 = _mm_shuffle_epi8( vsrc01, vshufmask );
+ #else
+      vblend01 = _mm_shufflelo_epi16( vsrc01, 0xff );
+      vblend01 = _mm_shufflehi_epi16( vblend01, 0xff );
+ #endif
+      vdst01 = _mm_adds_epu16( _mm_adds_epu16( _mm_mullo_epi16( vdst01, _mm_sub_epi16( v255, vblend01 ) ), _mm_mullo_epi16( vsrc01, vblend01 ) ), vroundbias );
+      /* Correction to divide by 255 instead of 256 */
+      vdst01 = _mm_srli_epi16( _mm_adds_epu16( vdst01, _mm_srli_epi16( vdst01, 8 ) ), 8 );
+      _mm_store_ss( (void *)dstrow, _mm_castsi128_ps( _mm_or_si128( _mm_packus_epi16( vdst01, vdst01 ), valphatest ) ) );
+    }
+#else
+    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 4 )
+    {
+      if( !( srcrow[3] ) )
+        continue;
+      srcr = (int32_t)srcrow[0];
+      srcg = (int32_t)srcrow[1];
+      srcb = (int32_t)srcrow[2];
+      srca = (int32_t)srcrow[3];
+      dstr = (int32_t)dstrow[0];
+      dstg = (int32_t)dstrow[1];
+      dstb = (int32_t)dstrow[2];
+      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
+      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
+      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
+      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
+      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
+      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
+      dstrow[0] = (unsigned char)dstr;
+      dstrow[1] = (unsigned char)dstg;
+      dstrow[2] = (unsigned char)dstb;
+      dstrow[3] = (unsigned char)255;
+    }
+#endif
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+static void imgBlendImageRgba2Rgb( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
+{
+  int x, y;
+  int32_t dstr, dstg, dstb;
+  int32_t srcr, srcg, srcb, srca;
+  unsigned char *src, *srcrow, *dstrow;
+  uint32_t *dst;
+
+  /* TODO: Other function to clamp copy area? */
+
+  src = srcimage->data;
+  dst = ADDRESS( dstimage->data, ( dstx * 3 ) + ( dsty * dstimage->format.bytesperline ) );
+  for( y = 0 ; y < srcimage->format.height ; y++ )
+  {
+    srcrow = src;
+    dstrow = (unsigned char *)dst;
+    for( x = 0 ; x < srcimage->format.width ; x++, srcrow += 4, dstrow += 3 )
+    {
+      if( !( srcrow[3] ) )
+        continue;
+      srcr = (int32_t)srcrow[0];
+      srcg = (int32_t)srcrow[1];
+      srcb = (int32_t)srcrow[2];
+      srca = (int32_t)srcrow[3];
+      dstr = (int32_t)dstrow[0];
+      dstg = (int32_t)dstrow[1];
+      dstb = (int32_t)dstrow[2];
+      dstr = ( ( dstr << 8 ) - dstr + ( srca * ( srcr - dstr ) ) + 128 );
+      dstg = ( ( dstg << 8 ) - dstg + ( srca * ( srcg - dstg ) ) + 128 );
+      dstb = ( ( dstb << 8 ) - dstb + ( srca * ( srcb - dstb ) ) + 128 );
+      dstr = ( dstr + ( dstr >> 8 ) ) >> 8;
+      dstg = ( dstg + ( dstg >> 8 ) ) >> 8;
+      dstb = ( dstb + ( dstb >> 8 ) ) >> 8;
+      dstrow[0] = (unsigned char)dstr;
+      dstrow[1] = (unsigned char)dstg;
+      dstrow[2] = (unsigned char)dstb;
+    }
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+
+void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
+{
+  void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
+  blendfunc = 0;
+  if( srcimage->format.bytesperpixel == 4 )
+  {
+    if( dstimage->format.bytesperpixel == 4 )
+    {
+      if( ( dstimage->format.type == IMG_FORMAT_TYPE_RGBA32 ) || ( dstimage->format.type == IMG_FORMAT_TYPE_BGRA32 ) )
+        blendfunc = imgBlendImageRgba2Rgba;
+      else
+        blendfunc = imgBlendImageRgba2Rgbx;
+    }
+    else if( dstimage->format.bytesperpixel == 3 )
+      blendfunc = imgBlendImageRgba2Rgb;
+  }
+  return blendfunc;
+}
+
+
+int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage )
+{
+  void (*blendfunc)( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
+  blendfunc = imgBlendGetFunction( dstimage, srcimage );
+  if( blendfunc )
+  {
+    blendfunc( dstimage, dstx, dsty, srcimage );
+    return 1;
+  }
+  return 0;
+}
+
+
+////
+
+
+void imgAllocCopy( imgImage *dstimage, imgImage *srcimage )
+{
+  dstimage->format = srcimage->format;
+  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );
+  memcpy( dstimage->data, srcimage->data, srcimage->format.height * srcimage->format.bytesperline );
+  return;
+}
+
+
+void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize )
+{
+  int y;
+  void *dst, *src, *dstrow;
+
+  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
+  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
+  dstimage->format.type = srcimage->format.type;
+  dstimage->format.bytesperpixel = srcimage->format.bytesperpixel;
+  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
+  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );
+
+  src = srcimage->data;
+  dst = dstimage->data;
+  for( y = 0 ; y < extendsize ; y++ )
+  {
+    memset( dst, 0, dstimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+  for( y = 0 ; y < srcimage->format.height ; y++ )
+  {
+    dstrow = dst;
+    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
+    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
+    memcpy( dstrow, src, srcimage->format.width * dstimage->format.bytesperpixel );
+    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
+    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+  for( y = 0 ; y < extendsize ; y++ )
+  {
+    memset( dst, 0, dstimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+
+void imgAllocExtractChannel( imgImage *dstimage, imgImage *srcimage, int channelindex )
+{
+  int x, y;
+  unsigned char *dst, *src, *srcrow;
+
+  dstimage->format.width = srcimage->format.width;
+  dstimage->format.height = srcimage->format.height;
+  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
+  dstimage->format.bytesperpixel = 1;
+  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
+  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );
+
+  src = ADDRESS( srcimage->data, channelindex );
+  dst = dstimage->data;
+  for( y = 0 ; y < dstimage->format.height ; y++ )
+  {
+    srcrow = src;
+    for( x = 0 ; x < dstimage->format.width ; x++ )
+    {
+      dst[x] = *srcrow;
+      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
+    }
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+
+void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize )
+{
+  int x, y;
+  unsigned char *src, *dst, *srcrow, *dstrow;
+
+  dstimage->format.width = srcimage->format.width + ( extendsize << 1 );
+  dstimage->format.height = srcimage->format.height + ( extendsize << 1 );
+  dstimage->format.type = IMG_FORMAT_TYPE_GRAYSCALE;
+  dstimage->format.bytesperpixel = 1;
+  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
+  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );
+
+  src = ADDRESS( srcimage->data, channelindex );
+  dst = dstimage->data;
+  for( y = 0 ; y < extendsize ; y++ )
+  {
+    memset( dst, 0, dstimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+  for( y = 0 ; y < srcimage->format.height ; y++ )
+  {
+    srcrow = src;
+    dstrow = dst;
+    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
+    dstrow = ADDRESS( dstrow, extendsize * dstimage->format.bytesperpixel );
+    for( x = 0 ; x < srcimage->format.width ; x++ )
+    {
+      dstrow[x] = *srcrow;
+      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
+    }
+    dstrow = ADDRESS( dstrow, srcimage->format.width * dstimage->format.bytesperpixel );
+    memset( dstrow, 0, extendsize * dstimage->format.bytesperpixel );
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+  for( y = 0 ; y < extendsize ; y++ )
+  {
+    memset( dst, 0, dstimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+
+void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b )
+{
+  int x, y;
+  unsigned char *dst, *src, *dstrow, *srcrow;
+
+  dstimage->format.width = srcimage->format.width;
+  dstimage->format.height = srcimage->format.height;
+  dstimage->format.type = IMG_FORMAT_TYPE_RGBA32;
+  dstimage->format.bytesperpixel = 4;
+  dstimage->format.bytesperline = dstimage->format.width * dstimage->format.bytesperpixel;
+  dstimage->data = malloc( dstimage->format.height * dstimage->format.bytesperline );
+
+  src = ADDRESS( srcimage->data, channelindex );
+  dst = dstimage->data;
+  for( y = 0 ; y < dstimage->format.height ; y++ )
+  {
+    srcrow = src;
+    dstrow = dst;
+    for( x = 0 ; x < dstimage->format.width ; x++ )
+    {
+      dstrow[0] = r;
+      dstrow[1] = g;
+      dstrow[2] = b;
+      dstrow[3] = *srcrow;
+      srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
+      dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
+    }
+    src = ADDRESS( src, srcimage->format.bytesperline );
+    dst = ADDRESS( dst, dstimage->format.bytesperline );
+  }
+
+  return;
+}
+
+
+void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast )
+{
+  int x, y;
+  float r, g, b;
+  unsigned char *dst, *src, *dstrow, *srcrow;
+
+  dstimage->format = srcimage->format;
+  dstimage->data = malloc( srcimage->format.height * srcimage->format.bytesperline );
+
+  brightness += 0.5f;
+
+  if( dstimage->format.bytesperpixel >= 3 )
+  {
+    src = srcimage->data;
+    dst = dstimage->data;
+    for( y = 0 ; y < dstimage->format.height ; y++ )
+    {
+      srcrow = src;
+      dstrow = dst;
+      for( x = 0 ; x < dstimage->format.width ; x++ )
+      {
+        r = (1.0f/255.0f) * (float)srcrow[0];
+        g = (1.0f/255.0f) * (float)srcrow[1];
+        b = (1.0f/255.0f) * (float)srcrow[2];
+        r = ( ( r - 0.5f ) * contrast ) + brightness;
+        g = ( ( g - 0.5f ) * contrast ) + brightness;
+        b = ( ( b - 0.5f ) * contrast ) + brightness;
+        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
+        dstrow[1] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( g * 255.0f ) ) );
+        dstrow[2] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( b * 255.0f ) ) );
+        if( dstimage->format.bytesperpixel >= 4 )
+          dstrow[3] = srcrow[3];
+        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
+        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
+      }
+      src = ADDRESS( src, srcimage->format.bytesperline );
+      dst = ADDRESS( dst, dstimage->format.bytesperline );
+    }
+  }
+  else if( dstimage->format.bytesperpixel == 1 )
+  {
+    src = srcimage->data;
+    dst = dstimage->data;
+    for( y = 0 ; y < dstimage->format.height ; y++ )
+    {
+      srcrow = src;
+      dstrow = dst;
+      for( x = 0 ; x < dstimage->format.width ; x++ )
+      {
+        r = (1.0f/255.0f) * (float)srcrow[0];
+        r = ( ( r - 0.5f ) * contrast ) + brightness;
+        dstrow[0] = (unsigned char)fmaxf( 0.0f, fminf( 255.0f, roundf( r * 255.0f ) ) );
+        srcrow = ADDRESS( srcrow, srcimage->format.bytesperpixel );
+        dstrow = ADDRESS( dstrow, dstimage->format.bytesperpixel );
+      }
+      src = ADDRESS( src, srcimage->format.bytesperline );
+      dst = ADDRESS( dst, dstimage->format.bytesperline );
+    }
+  }
+
+  return;
+}
+
+
+void imgFree( imgImage *image )
+{
+  free( image->data );
+  image->data = 0;
+  return;
+}
+
+
+////
+
+
diff --git a/lib/graphics_utils/mipmap/img.h b/lib/graphics_utils/mipmap/img.h
new file mode 100644
index 000000000..3871ca3f3
--- /dev/null
+++ b/lib/graphics_utils/mipmap/img.h
@@ -0,0 +1,74 @@
+/* *****************************************************************************
+ *
+ * Copyright (c) 2007-2016 Alexis Naveros.
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this file; see the file named COPYING for more
+ * information.
+ *
+ * *****************************************************************************
+ */
+
+#ifndef IMG_H
+#define IMG_H
+
+
+typedef struct
+{
+  int width;
+  int height;
+  int type;
+  int bytesperpixel;
+  int bytesperline;
+} imgFormat;
+
+enum
+{
+  IMG_FORMAT_TYPE_ANY,
+  IMG_FORMAT_TYPE_RGB24,
+  IMG_FORMAT_TYPE_BGR24,
+  IMG_FORMAT_TYPE_RGBX32,
+  IMG_FORMAT_TYPE_BGRX32,
+  IMG_FORMAT_TYPE_RGBA32,
+  IMG_FORMAT_TYPE_BGRA32,
+  IMG_FORMAT_TYPE_GRAYSCALE,
+  IMG_FORMAT_TYPE_GRAYALPHA
+};
+
+typedef struct
+{
+  imgFormat format;
+  void *data;
+} imgImage;
+
+
+////
+
+
+void imgCopyRect( imgImage *image, int dstx, int dsty, int srcx, int srcy, int sizex, int sizey );
+
+void (*imgBlendGetFunction( imgImage *dstimage, imgImage *srcimage ))( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
+int imgBlendImage( imgImage *dstimage, int dstx, int dsty, imgImage *srcimage );
+
+void imgAllocCopy( imgImage *dst, imgImage *src );
+void imgAllocCopyExtendBorder( imgImage *dstimage, imgImage *srcimage, int extendsize );
+void imgAllocExtractChannel( imgImage *dst, imgImage *src, int channelindex );
+void imgAllocExtractChannelExtendBorder( imgImage *dstimage, imgImage *srcimage, int channelindex, int extendsize );
+void imgAllocCopyChannelToAlpha( imgImage *dstimage, imgImage *srcimage, int channelindex, unsigned char r, unsigned char g, unsigned char b );
+void imgAllocAdjustBrightnessContrast( imgImage *dstimage, imgImage *srcimage, float brightness, float contrast );
+
+void imgFree( imgImage *image );
+
+
+#endif
+
diff --git a/lib/graphics_utils/mipmap/imgresize.c b/lib/graphics_utils/mipmap/imgresize.c
new file mode 100644
index 000000000..72edeb9a1
--- /dev/null
+++ b/lib/graphics_utils/mipmap/imgresize.c
@@ -0,0 +1,4098 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2014-2017 Alexis Naveros.
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * -----------------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+
+#include "cpusimd.h"
+
+#include "img.h"
+#include "imgresize.h"
+
+
+////
+
+
+#define IM_RESIZE_DEBUG (0)
+#define IM_RESIZE_DEBUG_PROGRESS (0)
+
+
+////
+
+
+#ifndef M_PI
+ #define M_PI (3.14159265358979323846)
+#endif
+
+#ifndef ADDRESS
+ #define ADDRESS(p,o) ((void *)(((char *)p)+(o)))
+#endif
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+ #define CC_ALWAYSINLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+ #define CC_ALWAYSINLINE __forceinline
+#else
+ #define CC_ALWAYSINLINE
+#endif
+
+static inline CC_ALWAYSINLINE uint32_t ccIsPow2Int32( uint32_t v )
+{
+  return ( ( v & ( v - 1 ) ) == 0 );
+}
+
+#define ROUND_POSITIVE_FLOAT(x) ((int)((x)+0.5f))
+
+
+////
+
+
+#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__i386__) || defined(__i386)  || defined(i386)
+
+/* Input is 0.0,255.0, output is 0.0,1.0 */
+static inline CC_ALWAYSINLINE float srgb2linear( float v )
+{
+  float v2, vpow, vpwsqrt;
+  union
+  {
+    int32_t i;
+    float f;
+  } u;
+  if( v <= (0.04045f*255.0f) )
+    v = v * ( (1.0f/12.92f)*(1.0f/255.0f) );
+  else
+  {
+    v = ( v + (0.055f*255.0f) ) * ( (1.0f/1.055f)*(1.0f/255.0f) );
+    v2 = v * v;
+    u.f = v * 5417434112.0f;
+    u.i = (int32_t)ROUND_POSITIVE_FLOAT( (float)u.i * 0.8f );
+    vpow = u.f;
+    vpwsqrt = sqrtf( vpow );
+    v = ( ( v2 * vpwsqrt ) + ( ( ( v2 * v ) / vpwsqrt ) / sqrtf( vpwsqrt ) ) ) * 0.51011878327f;
+  }
+  return v;
+}
+
+/* Input is 0.0,1.0, output is 0.0,255.0 */
+static inline CC_ALWAYSINLINE float linear2srgb( float v )
+{
+  float vpow;
+  union
+  {
+    int32_t i;
+    float f;
+  } u;
+  if( v <= 0.0031308f )
+    v = v * (12.92f*255.0f);
+  else
+  {
+    u.f = ( v * 6521909350804488192.0f );
+    u.i = (int32_t)ROUND_POSITIVE_FLOAT( (float)u.i * 0.666666666666f );
+    vpow = u.f;
+    v = ( v * vpow ) + ( ( v * v ) / sqrtf( vpow ) );
+    v = ( (1.055f*255.0f) * sqrtf( sqrtf( v * 0.5290553722f ) ) - (0.055f*255.0f) );
+  }
+  return v;
+}
+
+#else
+
+/* Input is 0.0,255.0, output is 0.0,1.0 */
+/* Only for reference, this is waayyy too slow and should never be used */
+static inline CC_ALWAYSINLINE float srgb2linear( float v )
+{
+  v *= (1.0f/255.0f);
+  if( v <= 0.04045f )
+    v = v * (1.0f/12.92);
+  else
+    v = pow( ( v + 0.055f ) * (1.0f/1.055f), 2.4f );
+  return v;
+}
+
+/* Input is 0.0,1.0, output is 0.0,255.0 */
+/* Only for reference, this is waayyy too slow and should never be used */
+static inline CC_ALWAYSINLINE float linear2srgb( float v )
+{
+  if( v <= 0.0031308f )
+    v = v * 12.92f;
+  else
+    v = 1.055f * pow( v, 1.0f/2.4f ) - 0.055f;
+  return 255.0f * v;
+}
+
+#endif
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+static const float CPU_ALIGN16 srgbLinearConst00[4] = { 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f, 1024.0f };
+static const float CPU_ALIGN16 srgbLinearConst01[4] = { (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), 1.0f };
+static const float CPU_ALIGN16 srgbLinearConst02[4] = { 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f, 0.055f*255.0f };
+static const float CPU_ALIGN16 srgbLinearConst03[4] = { (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f), (1.0f/1.055f)*(1.0f/255.0f) };
+static const float CPU_ALIGN16 srgbLinearConst04[4] = { 5417434112.0f, 5417434112.0f, 5417434112.0f, 5417434112.0f };
+static const float CPU_ALIGN16 srgbLinearConst05[4] = { 0.8f, 0.8f, 0.8f, 0.8f };
+static const float CPU_ALIGN16 srgbLinearConst06[4] = { 0.51011878327f, 0.51011878327f, 0.51011878327f, 0.51011878327f };
+static const float CPU_ALIGN16 srgbLinearConst07[4] = { 0.0031308f, 0.0031308f, 0.0031308f, 1024.0f };
+static const float CPU_ALIGN16 srgbLinearConst08[4] = { 12.92f*255.0f, 12.92f*255.0f, 12.92f*255.0f, 1.0f };
+static const float CPU_ALIGN16 srgbLinearConst09[4] = { 6521909350804488192.0f, 6521909350804488192.0f, 6521909350804488192.0f, 6521909350804488192.0f };
+static const float CPU_ALIGN16 srgbLinearConst10[4] = { 0.666666666666f, 0.666666666666f, 0.666666666666f, 0.666666666666f };
+static const float CPU_ALIGN16 srgbLinearConst11[4] = { 0.5290553722f, 0.5290553722f, 0.5290553722f, 0.5290553722f };
+static const float CPU_ALIGN16 srgbLinearConst12[4] = { 1.055f*255.0f, 1.055f*255.0f, 1.055f*255.0f, 1.055f*255.0f };
+static const float CPU_ALIGN16 srgbLinearConst13[4] = { -0.055f*255.0f, -0.055f*255.0f, -0.055f*255.0f, -0.055f*255.0f };
+static const float CPU_ALIGN16 srgbLinearConst14[4] = { 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f, 0.04045f*255.0f };
+static const float CPU_ALIGN16 srgbLinearConst15[4] = { (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f), (1.0f/12.92f)*(1.0f/255.0f) };
+
+/* Input is 0.0,255.0 ~ output is 0.0,1.0 ~ alpha channel is passed as-is */
+static inline CC_ALWAYSINLINE __m128 srgb2linear3( __m128 vx )
+{
+  __m128 vmask, vbase;
+  __m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
+  vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst00 );
+  vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst01 );
+  vx = _mm_mul_ps( _mm_add_ps( vx, *(__m128*)srgbLinearConst02 ), *(__m128*)srgbLinearConst03 );
+  vx2 = _mm_mul_ps( vx, vx );
+  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst04 ) ) ), *(__m128*)srgbLinearConst05 ) ) );
+  vpwsqrtinv = _mm_rsqrt_ps( vpow );
+  vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
+  vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), *(__m128*)srgbLinearConst06 );
+  return CPU_BLENDV_PS( vx, vbase, vmask );
+}
+
+/* Input is 0.0,1.0 ~ output is 0.0,255.0 ~ alpha channel is passed as-is */
+static inline CC_ALWAYSINLINE __m128 linear2srgb3( __m128 vx )
+{
+  __m128 vmask, vbase, vpow;
+  vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst07 );
+  vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst08 );
+  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst09 ) ) ), *(__m128*)srgbLinearConst10 ) ) );
+  vx = _mm_add_ps( _mm_mul_ps( _mm_sqrt_ps( _mm_sqrt_ps( _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx, vpow ), _mm_mul_ps( _mm_mul_ps( vx, vx ), _mm_rsqrt_ps( vpow ) ) ), *(__m128*)srgbLinearConst11 ) ) ), *(__m128*)srgbLinearConst12 ), *(__m128*)srgbLinearConst13 );
+  return CPU_BLENDV_PS( vx, vbase, vmask );
+}
+
+/* Input is 0.0,255.0 ~ output is 0.0,1.0 ~ alpha channel is passed as-is */
+static inline CC_ALWAYSINLINE __m128 srgb2linear4( __m128 vx )
+{
+  __m128 vmask, vbase;
+  __m128 vpow, vpwsqrtinv, vpwsqrt, vx2;
+  vmask = _mm_cmple_ps( vx, *(__m128*)srgbLinearConst14 );
+  vbase = _mm_mul_ps( vx, *(__m128*)srgbLinearConst15 );
+  vx = _mm_mul_ps( _mm_add_ps( vx, *(__m128*)srgbLinearConst02 ), *(__m128*)srgbLinearConst03 );
+  vx2 = _mm_mul_ps( vx, vx );
+  vpow = _mm_castsi128_ps( _mm_cvtps_epi32( _mm_mul_ps( _mm_cvtepi32_ps( _mm_castps_si128( _mm_mul_ps( vx, *(__m128*)srgbLinearConst04 ) ) ), *(__m128*)srgbLinearConst05 ) ) );
+  vpwsqrtinv = _mm_rsqrt_ps( vpow );
+  vpwsqrt = _mm_mul_ps( vpow, vpwsqrtinv );
+  vx = _mm_mul_ps( _mm_add_ps( _mm_mul_ps( vx2, vpwsqrt ), _mm_mul_ps( _mm_mul_ps( _mm_mul_ps( vx2, vx ), vpwsqrtinv ), _mm_rsqrt_ps( vpwsqrt ) ) ), *(__m128*)srgbLinearConst06 );
+  return CPU_BLENDV_PS( vx, vbase, vmask );
+}
+
+#endif
+
+
+////
+
+
+static inline CC_ALWAYSINLINE double bessel( double x )
+{
+  double sum, t, y;
+
+  /* Zeroth order Bessel function of the first kind. */
+  sum = 1.0;
+  y = x * x * 0.25;
+  t = y;
+  sum += t;
+  t *= y * (1.0/(2.0*2.0));
+  sum += t;
+  t *= y * (1.0/(3.0*3.0));
+  sum += t;
+  t *= y * (1.0/(4.0*4.0));
+  sum += t;
+  t *= y * (1.0/(5.0*5.0));
+  sum += t;
+  t *= y * (1.0/(6.0*6.0));
+  sum += t;
+  t *= y * (1.0/(7.0*7.0));
+  sum += t;
+  t *= y * (1.0/(8.0*8.0));
+  sum += t;
+  t *= y * (1.0/(9.0*9.0));
+  sum += t;
+  t *= y * (1.0/(10.0*10.0));
+  sum += t;
+  t *= y * (1.0/(11.0*11.0));
+  sum += t;
+  t *= y * (1.0/(12.0*12.0));
+  sum += t;
+  t *= y * (1.0/(13.0*13.0));
+  sum += t;
+  t *= y * (1.0/(14.0*14.0));
+  sum += t;
+
+  return sum;
+}
+
+static inline CC_ALWAYSINLINE double kaiser( double x, double beta )
+{
+  return bessel( beta * sqrt( fmaxf( 0.0, 1.0 - ( x * x ) ) ) );
+}
+
+static inline CC_ALWAYSINLINE double sinc( double x )
+{
+  if( x == 0.0 )
+    return 1.0;
+  x = sin( x * M_PI ) / ( x * M_PI );
+  return x;
+}
+
+
+////
+
+
+#if CPU_SSE2_SUPPORT
+
+static inline CC_ALWAYSINLINE __m128 simd4f_bessel( __m128 x )
+{
+  __m128 sum, t, y;
+
+  sum = *(__m128 *)simd4fOne;
+  y = _mm_mul_ps( *(__m128 *)simd4fQuarter, _mm_mul_ps( x, x ) );
+  t = y;
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(2.0f*2.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(3.0f*3.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(4.0f*4.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(5.0f*5.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(6.0f*6.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(7.0f*7.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(8.0f*8.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(9.0f*9.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(10.0f*10.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(11.0f*11.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(12.0f*12.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(13.0f*13.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+  t = _mm_mul_ps( t, _mm_mul_ps( y, _mm_set1_ps( 1.0f/(14.0f*14.0f) ) ) );
+  sum = _mm_add_ps( sum, t );
+
+  return sum;
+}
+
+static inline CC_ALWAYSINLINE __m128 simd4f_kaiser( __m128 x, __m128 beta )
+{
+  return simd4f_bessel( _mm_mul_ps( beta, _mm_sqrt_ps( _mm_max_ps( _mm_setzero_ps(), _mm_sub_ps( *(__m128 *)simd4fOne, _mm_mul_ps( x, x ) ) ) ) ) );
+}
+
+static inline CC_ALWAYSINLINE __m128 simd4f_sinc( __m128 x )
+{
+  __m128 zeromask;
+  zeromask = _mm_cmpeq_ps( x, _mm_setzero_ps() );
+  x = _mm_mul_ps( x, _mm_load_ps( simd4fPi ) );
+  x = _mm_div_ps( simd4f_sin_ps( x ), x );
+  x = CPU_BLENDV_PS( x, *(__m128 *)simd4fOne, zeromask );
+  return x;
+}
+
+#endif
+
+
+////
+
+
+typedef struct
+{
+  int matrixsize;
+  int matrixoffset;
+  int matrixrowwidth;
+  int matrixrowsize;
+  int rowreturn;
+  float *matrix;
+
+  int minimumalpha;
+  float dithersum;
+  float minimumalphaf;
+  float amplifynormal;
+  float normalsustainfactor;
+  void *alloc;
+
+  unsigned char *srcdata;
+  int width1;
+  int width2;
+  int width3;
+  int width4;
+  int height;
+  int bytesperline;
+} imStaticMatrixState;
+
+
+static int imBuildStaticMatrix( imStaticMatrixState * restrict state, int sizedivisor, float hopcount, float alpha )
+{
+  int i, j, minx, maxx;
+  double x, xshift, hopsize, offset, scalefactor, hopcountinv, beta, linsq, sum;
+  double *linear;
+  float *matrix;
+
+  if( alpha > 16.0f )
+    alpha = 16.0f;
+  beta = alpha * M_PI;
+  hopcountinv = 1.0 / hopcount;
+
+  scalefactor = 1.0 / (double)sizedivisor;
+  hopsize = 0.5 * (double)sizedivisor;
+  offset = hopsize - 0.5;
+  minx = (int)ceil( ( -hopcount * hopsize ) + offset );
+  maxx = (int)floor( ( hopcount * hopsize ) + offset );
+  state->matrixoffset = minx;
+  state->matrixsize = ( maxx - minx ) + 1;
+  state->matrixrowwidth = ( state->matrixsize + 3 ) & ~3;
+  state->rowreturn = state->matrixrowwidth - state->matrixsize;
+  state->matrixrowsize = state->matrixrowwidth * sizeof(float);
+
+#if IM_RESIZE_DEBUG
+  printf( "ResizeMatrix ; scalefactor %.3f, offset %.3f, hopsize %.3f\n", scalefactor, offset, hopsize );
+#endif
+
+  linear = malloc( state->matrixrowwidth * sizeof(double) );
+  for( i = 0 ; i < state->matrixsize ; i++ )
+  {
+    x = (double)( i + state->matrixoffset );
+    xshift = 2.0 * scalefactor * ( x - offset );
+    linear[i] = sinc( xshift ) * kaiser( hopcountinv * xshift, beta );
+#if IM_RESIZE_DEBUG
+    printf( " x[%+.3f] = %+.3f ( %+.3f * %+.3f )\n", x, linear[i], sinc( xshift ), kaiser( hopcountinv * xshift, beta ) );
+#endif
+  }
+  for( ; i < state->matrixrowwidth ; i++ )
+    linear[i] = 0.0;
+
+  /* Build normalized state */
+  state->alloc = malloc( ( state->matrixsize * state->matrixrowsize ) + 16 );
+  state->matrix = (void *)( ( (uintptr_t)state->alloc + 0xf ) & ~0xf );
+  matrix = state->matrix;
+  sum = 0.0;
+  for( i = 0 ; i < state->matrixsize ; i++ )
+  {
+    for( j = 0 ; j < state->matrixsize ; j++ )
+    {
+      linsq = linear[i] * linear[j];
+      matrix[j] = (float)linsq;
+      sum += linsq;
+    }
+    for( ; j < state->matrixrowwidth ; j++ )
+      matrix[j] = 0.0f;
+    matrix += state->matrixrowwidth;
+  }
+  free( linear );
+
+#if IM_RESIZE_DEBUG
+  printf( "Matrix sum : %f\n", sum );
+#endif
+
+  sum = 1.0 / sum;
+  j = state->matrixsize * state->matrixrowwidth;
+  for( i = 0 ; i < j ; i++ )
+    state->matrix[i] *= sum;
+
+#if IM_RESIZE_DEBUG
+  printf( "Matrix %dx%d :\n", state->matrixsize, state->matrixsize );
+  for( i = 0 ; i < state->matrixsize ; i++ )
+  {
+    for( j = 0 ; j < state->matrixsize ; j++ )
+      printf( " %+.6f", state->matrix[ ( i * state->matrixrowwidth ) + j ] );
+    printf( "\n" );
+  }
+  printf( "Matrix Offset : %d\n", state->matrixoffset );
+  printf( "Matrix Size : %d\n", state->matrixsize );
+  printf( "Matrix Rowwidth : %d\n", state->matrixrowwidth );
+#endif
+
+  return 1;
+}
+
+
+static void imFreeStaticState( imStaticMatrixState * restrict state )
+{
+  free( state->alloc );
+  state->alloc = 0;
+  return;
+}
+
+
+////
+
+
+typedef struct
+{
+  int matrixsizex, matrixsizey;
+  int matrixoffsetx, matrixoffsety;
+  float *linearx;
+  float *lineary;
+  float beta;
+  float hopcountinv;
+
+  float dithersum;
+  int minimumalpha;
+  float minimumalphaf;
+  float amplifynormal;
+  float normalsustainfactor;
+  void *alloc;
+
+  unsigned char *srcdata;
+  int width1;
+  int width2;
+  int width3;
+  int width4;
+  int height;
+  int bytesperline;
+} imGenericMatrixState;
+
+
+static inline int imAllocGenericState( imGenericMatrixState *state, float scalex, float scaley, float hopcount, float alpha )
+{
+  int allocx, allocy, size;
+  void *align;
+  if( alpha > 16.0f )
+    alpha = 16.0f;
+  allocx = ( (int)ceilf( hopcount / scalex ) + 2 + 3 ) & ~0x3;
+  allocy = ( (int)ceilf( hopcount / scaley ) + 2 + 3 ) & ~0x3;
+  size = ( ( allocx + allocy ) * sizeof(float) ) + 16;
+  state->alloc = malloc( size );
+  memset( state->alloc, 0, size );
+  align = (void *)( ( (uintptr_t)state->alloc + 0xf ) & ~0xf );
+  state->linearx = align;
+  state->lineary = ADDRESS( align, allocx * sizeof(float) );
+  state->beta = (float)alpha * M_PI;
+  state->hopcountinv = 1.0f / (float)hopcount;
+  return 1;
+}
+
+static inline void imBuildGenericLinearX( imGenericMatrixState *state, float scalex, float scaleinvx, float sourcex, float hopcount, float alpha, int width )
+{
+  int i, minx, maxx;
+  float hopsizex, offsetx;
+  float *linearx;
+
+  hopsizex = 0.5 * scaleinvx;
+  offsetx = (float)sourcex;
+  minx = (int)ceil( ( -hopcount * hopsizex ) + offsetx );
+  maxx = (int)floor( ( hopcount * hopsizex ) + offsetx );
+  state->matrixsizex = ( maxx - minx ) + 1;
+  state->matrixoffsetx = ( minx + ( width << 8 ) ) % width;
+
+  linearx = state->linearx;
+  scalex *= 2.0;
+#if CPU_SSE2_SUPPORT
+  for( i = 0 ; i < state->matrixsizex ; i += 4 )
+  {
+    __m128 vx, vxshift;
+    vx = _mm_add_ps( _mm_set1_ps( (float)( i + minx ) ), _mm_load_ps( simd4fZeroOneTwoThree ) );
+    vxshift = _mm_mul_ps( _mm_set1_ps( scalex ), _mm_sub_ps( vx, _mm_set1_ps( offsetx ) ) );
+    _mm_store_ps( &linearx[i], _mm_mul_ps( simd4f_sinc( vxshift ), simd4f_kaiser( _mm_mul_ps( _mm_set1_ps( state->hopcountinv ), vxshift ), _mm_set1_ps( state->beta ) ) ) );
+ #if IM_RESIZE_DEBUG
+    printf( " linearx[%d] = %.3f\n", i+minx+0, linearx[i+0] );
+    printf( " linearx[%d] = %.3f\n", i+minx+1, linearx[i+1] );
+    printf( " linearx[%d] = %.3f\n", i+minx+2, linearx[i+2] );
+    printf( " linearx[%d] = %.3f\n", i+minx+3, linearx[i+3] );
+ #endif
+  }
+#else
+  for( i = 0 ; i < state->matrixsizex ; i++ )
+  {
+    float x, xshift;
+    x = (float)( i + minx );
+    xshift = scalex * ( x - offsetx );
+    linearx[i] = (float)( sinc( xshift ) * kaiser( state->hopcountinv * xshift, state->beta ) );
+ #if IM_RESIZE_DEBUG
+    printf( " linearx[%+.3f] = %.3f ( %+.3f * %+.3f )\n", x, linearx[i], sinc( xshift ), kaiser( state->hopcountinv * xshift, state->beta ) );
+ #endif
+  }
+#endif
+
+  return;
+}
+
+static inline void imBuildGenericLinearY( imGenericMatrixState *state, float scaley, float scaleinvy, float sourcey, float hopcount, float alpha, int height )
+{
+  int i, miny, maxy;
+  float hopsizey, offsety;
+  float *lineary;
+
+  hopsizey = 0.5 * scaleinvy;
+  offsety = (float)sourcey;
+  miny = (int)ceil( ( -hopcount * hopsizey ) + offsety );
+  maxy = (int)floor( ( hopcount * hopsizey ) + offsety );
+  state->matrixsizey = ( maxy - miny ) + 1;
+  state->matrixoffsety = ( miny + ( height << 8 ) ) % height;
+
+  lineary = state->lineary;
+  scaley *= 2.0;
+#if CPU_SSE2_SUPPORT
+  for( i = 0 ; i < state->matrixsizey ; i += 4 )
+  {
+    __m128 vy, vyshift;
+    vy = _mm_add_ps( _mm_set1_ps( (float)( i + miny ) ), _mm_load_ps( simd4fZeroOneTwoThree ) );
+    vyshift = _mm_mul_ps( _mm_set1_ps( scaley ), _mm_sub_ps( vy, _mm_set1_ps( offsety ) ) );
+    _mm_store_ps( &lineary[i], _mm_mul_ps( simd4f_sinc( vyshift ), simd4f_kaiser( _mm_mul_ps( _mm_set1_ps( state->hopcountinv ), vyshift ), _mm_set1_ps( state->beta ) ) ) );
+ #if IM_RESIZE_DEBUG
+    printf( " lineary[%d] = %.3f\n", i+miny+0, lineary[i+0] );
+    printf( " lineary[%d] = %.3f\n", i+miny+1, lineary[i+1] );
+    printf( " lineary[%d] = %.3f\n", i+miny+2, lineary[i+2] );
+    printf( " lineary[%d] = %.3f\n", i+miny+3, lineary[i+3] );
+ #endif
+  }
+#else
+  for( i = 0 ; i < state->matrixsizey ; i++ )
+  {
+    float y, yshift;
+    y = (float)( i + miny );
+    yshift = scaley * ( y - offsety );
+    lineary[i] = (float)( sinc( yshift ) * kaiser( state->hopcountinv * yshift, state->beta ) );
+ #if IM_RESIZE_DEBUG
+    printf( " lineary[%+.3f] = %.3f ( %+.3f * %+.3f )\n", y, lineary[i], sinc( yshift ), kaiser( state->hopcountinv * yshift, state->beta ) );
+ #endif
+  }
+#endif
+
+  return;
+}
+
+
+static inline void imFreeGenericState( imGenericMatrixState *state )
+{
+  free( state->alloc );
+  state->alloc = 0;
+  return;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+static void imStaticKernel1Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      mapx++;
+      if( mapx >= state->width1 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernel2Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 1;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      mapx += 2;
+      if( mapx >= state->width2 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernel3Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx + ( pointx << 1 );
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernel4Linear( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f, sum0, sum1, sum2, sum3;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      sum3 += f * (float)src[ mapx + 3 ];
+#endif
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+#else
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+
+static void imStaticKernel4Linear_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  __m128 vsum, vf, v0, v1, v2, v3;
+  __m128i vzero;
+  float *matrix;
+  unsigned char *src;
+
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x += 4 )
+    {
+      vf = _mm_load_ps( &matrix[x] );
+      v0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 0 ] ) ), vzero ) );
+      v1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 4 ] ) ), vzero ) );
+      v2 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 8 ] ) ), vzero ) );
+      v3 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx + 12 ] ) ), vzero ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x00 ), v0 ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x55 ), v1 ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xaa ), v2 ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xff ), v3 ) );
+      mapx += 16;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+
+  return;
+}
+
+#endif
+
+
+////
+
+
+static void imStaticKernel4LinearAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2, sum3;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x] * (float)src[ mapx + 3 ];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      sum3 += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0f / sum3;
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum0 * f ) + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum1 * f ) + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum2 * f ) + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+
+static void imStaticKernel4LinearAlphaNorm_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  uint32_t pixel;
+  float *matrix;
+  unsigned char *src;
+  __m128 vsum0, vsum1, vsum2, vsum3;
+  __m128 vf, valpha, vr, vg, vb, va, vsrcf;
+  __m128i vsrc, vshufmask;
+  __m128i vzero;
+
+ #if CPU_SSSE3_SUPPORT
+  vshufmask = _mm_setr_epi8( 0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f );
+ #endif
+  vsum0 = _mm_setzero_ps();
+  vsum1 = _mm_setzero_ps();
+  vsum2 = _mm_setzero_ps();
+  vsum3 = _mm_setzero_ps();
+  vzero = _mm_castps_si128( _mm_setzero_ps() );
+
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x += 4 )
+    {
+      vf = _mm_load_ps( &matrix[x] );
+      /* Load 16 bytes and unpack as RRRR,GGGG,BBBB,AAAA in one SSE register */
+      vsrc = _mm_loadu_si128( (void *)&src[ mapx ] );
+ #if CPU_SSSE3_SUPPORT
+      vsrc = _mm_shuffle_epi8( vsrc, vshufmask );
+ #else
+      vshufmask = _mm_shuffle_epi32( vsrc, 0x39 );
+      vsrc = _mm_unpacklo_epi16( _mm_unpacklo_epi8( vsrc, vshufmask ), _mm_unpackhi_epi8( vsrc, vshufmask ) );
+ #endif
+      /* Break that into 4 SSE registers as floats: vR,vG,vB,vA */
+      vsrcf = _mm_castsi128_ps( vsrc );
+      vr = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( vsrcf ), vzero ) );
+ #if CPU_SSE3_SUPPORT
+      vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehdup_ps( vsrcf ) ), vzero ) );
+ #else
+      vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0x55 ) ), vzero ) );
+ #endif
+      vb = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehl_ps( vsrcf, vsrcf ) ), vzero ) );
+      va = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0xff ) ), vzero ) );
+      valpha = _mm_mul_ps( va, vf );
+      vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( vr, valpha ) );
+      vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( vg, valpha ) );
+      vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( vb, valpha ) );
+      vsum3 = _mm_add_ps( vsum3, valpha );
+      mapx += 16;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+ #if CPU_SSE3_SUPPORT
+  vsum0 = _mm_hadd_ps( vsum0, vsum1 );
+  vsum2 = _mm_hadd_ps( vsum2, vsum3 );
+  vsum0 = _mm_hadd_ps( vsum0, vsum2 );
+ #else
+  vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum2 ), _mm_unpackhi_ps( vsum0, vsum2 ) );
+  vsum1 = _mm_add_ps( _mm_unpacklo_ps( vsum1, vsum3 ), _mm_unpackhi_ps( vsum1, vsum3 ) );
+  vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum1 ), _mm_unpackhi_ps( vsum0, vsum1 ) );
+ #endif
+
+  valpha = _mm_shuffle_ps( vsum0, vsum0, 0xff );
+  pixel = 0;
+  if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) )
+  {
+    __m128i vpixel;
+    vsum0 = _mm_mul_ps( vsum0, _mm_rcp_ps( valpha ) );
+    vsum0 = CPU_BLENDV_PS( vsum0, valpha, *(__m128 *)simd4fAlphaMask );
+    vpixel = _mm_cvtps_epi32( vsum0 );
+    vpixel = _mm_packs_epi32( vpixel, vpixel );
+    vpixel = _mm_packus_epi16( vpixel, vpixel );
+    pixel = (uint32_t)_mm_cvtsi128_si32( vpixel );
+  }
+  *(uint32_t *)dst = pixel;
+
+  return;
+}
+
+#endif
+
+
+////
+
+
+static void imStaticKernel1sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f, sum0;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_set_ss( (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set_ss( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+#endif
+      mapx++;
+      if( mapx >= state->width1 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+  vsum = linear2srgb3( vsum );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+#else
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+static void imStaticKernel2sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f, sum0, sum1;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 1;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_set_ps( 0.0f, 0.0f, (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+#endif
+      mapx += 2;
+      if( mapx >= state->width2 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+  vsum = linear2srgb3( vsum );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+  dst[1] = u.c[1];
+#else
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+static void imStaticKernel3sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f, sum0, sum1, sum2;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx + ( pointx << 1 );
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_set_ps( 0.0f, (float)src[ mapx + 2 ], (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += f * srgb2linear( (float)src[ mapx + 2 ] );
+#endif
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+  vsum = linear2srgb3( vsum );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+  dst[1] = u.c[1];
+  dst[2] = u.c[2];
+#else
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+static void imStaticKernel4sRGB( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f, sum0, sum1, sum2, sum3;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += f * srgb2linear( (float)src[ mapx + 2 ] );
+      sum3 += f * (float)src[ mapx + 3 ];
+#endif
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  vsum = linear2srgb3( vsum );
+  _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+#else
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+
+static void imStaticKernel3sRGB_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  __m128 vsum0, vsum1, vsum2, vsrc0, vsrc1, vsrc2, vf;
+  __m128i vzero;
+  float *matrix;
+  unsigned char *src;
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+
+  vsum0 = _mm_setzero_ps();
+  vsum1 = _mm_setzero_ps();
+  vsum2 = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx + ( pointx << 1 );
+    for( x = 0 ; x < state->matrixsize ; x += 4 )
+    {
+      vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) );
+      vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) );
+      vsrc2 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+8 ] ) ), vzero ) );
+      vsrc0 = srgb2linear4( vsrc0 );
+      vsrc1 = srgb2linear4( vsrc1 );
+      vsrc2 = srgb2linear4( vsrc2 );
+      vf = _mm_load_ps( &matrix[x] );
+      vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0x40 ), vsrc0 ) );
+      vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xA5 ), vsrc1 ) );
+      vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( _mm_shuffle_ps( vf, vf, 0xFE ), vsrc2 ) );
+      mapx += 12;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+ #if CPU_SSSE3_SUPPORT
+  vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum1 ), _mm_castps_si128( vsum0 ), 12 ) ) );
+  vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum2 ), _mm_castps_si128( vsum1 ), 8 ) ) );
+  vsum0 = _mm_add_ps( vsum0, _mm_castsi128_ps( _mm_alignr_epi8( _mm_castps_si128( vsum2 ), _mm_castps_si128( vsum2 ), 4 ) ) );
+ #else
+  vf = _mm_shuffle_ps( vsum0, vsum1, 0x4f );
+  vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vf, vf, 0x38 ) );
+  vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vsum1, vsum2, 0x0E ) );
+  vsum0 = _mm_add_ps( vsum0, _mm_shuffle_ps( vsum2, vsum2, 0x39 ) );
+ #endif
+
+  vsum0 = linear2srgb3( vsum0 );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum0 ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+  dst[1] = u.c[1];
+  dst[2] = u.c[2];
+
+  return;
+}
+
+static void imStaticKernel4sRGB_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  __m128 vsum, vsrc0, vsrc1;
+  __m128i vzero;
+  float *matrix;
+  unsigned char *src;
+
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x += 2 )
+    {
+      vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) );
+      vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) );
+      vsrc0 = srgb2linear3( vsrc0 );
+      vsrc1 = srgb2linear3( vsrc1 );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x+0] ), vsrc0 ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x+1] ), vsrc1 ) );
+      mapx += 8;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  vsum = linear2srgb3( vsum );
+  _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+
+  return;
+}
+
+#endif
+
+
+////
+
+
+static void imStaticKernel4sRGBAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc, valpha;
+  __m128i vzero;
+  uint32_t pixel;
+#else
+  float f, sum0, sum1, sum2, sum3;
+#endif
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      valpha = _mm_shuffle_ps( vsrc, _mm_set_ss( 1.0f ), 0x0f );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha, valpha, 0xC0 ), _mm_set1_ps( matrix[x] ) ), vsrc ) );
+#else
+      f = matrix[x] * (float)src[ mapx + 3 ];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += f * srgb2linear( (float)src[ mapx + 2 ] );
+      sum3 += f;
+#endif
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  valpha = _mm_shuffle_ps( vsum, vsum, 0xff );
+  pixel = 0;
+  if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) )
+  {
+    __m128i vpixel;
+    vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha ) );
+    vsum = CPU_BLENDV_PS( vsum, valpha, *(__m128 *)simd4fAlphaMask );
+    vsum = linear2srgb3( vsum );
+    vpixel = _mm_cvtps_epi32( vsum );
+    vpixel = _mm_packs_epi32( vpixel, vpixel );
+    vpixel = _mm_packus_epi16( vpixel, vpixel );
+    pixel = (uint32_t)_mm_cvtsi128_si32( vpixel );
+  }
+  *(uint32_t *)dst = pixel;
+#else
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0f / sum3;
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * f ) + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * f ) + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * f ) + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+#endif
+
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+
+static void imStaticKernel4sRGBAlphaNorm_Core( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  __m128 vsum, vsrc0, vsrc1, valpha0, valpha1;
+  __m128i vzero;
+  uint32_t pixel;
+  float *matrix;
+  unsigned char *src;
+
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x += 2 )
+    {
+      vsrc0 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+0 ] ) ), vzero ) );
+      vsrc1 = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx+4 ] ) ), vzero ) );
+      valpha0 = _mm_shuffle_ps( vsrc0, _mm_set_ss( 1.0f ), 0x0f );
+      valpha1 = _mm_shuffle_ps( vsrc1, _mm_set_ss( 1.0f ), 0x0f );
+      vsrc0 = srgb2linear3( vsrc0 );
+      vsrc1 = srgb2linear3( vsrc1 );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha0, valpha0, 0xC0 ), _mm_set1_ps( matrix[x+0] ) ), vsrc0 ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha1, valpha1, 0xC0 ), _mm_set1_ps( matrix[x+1] ) ), vsrc1 ) );
+      mapx += 8;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  valpha0 = _mm_shuffle_ps( vsum, vsum, 0xff );
+  pixel = 0;
+  if( _mm_comige_ss( valpha0, _mm_load_ss( &state->minimumalphaf ) ) )
+  {
+    __m128i vpixel;
+    vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha0 ) );
+    vsum = CPU_BLENDV_PS( vsum, valpha0, *(__m128 *)simd4fAlphaMask );
+    vsum = linear2srgb3( vsum );
+    vpixel = _mm_cvtps_epi32( vsum );
+    vpixel = _mm_packs_epi32( vpixel, vpixel );
+    vpixel = _mm_packus_epi16( vpixel, vpixel );
+    pixel = (uint32_t)_mm_cvtsi128_si32( vpixel );
+  }
+  *(uint32_t *)dst = pixel;
+
+  return;
+}
+
+#endif
+
+////
+
+
+static void imStaticKernel3Normal( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx + ( pointx << 1 );
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  sum0 -= 0.5f*255.0f;
+  sum1 -= 0.5f*255.0f;
+  sum2 -= 0.5f*255.0f;
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) );
+  sum0 = (0.5f*255.0f) + ( sum0 * suminv );
+  sum1 = (0.5f*255.0f) + ( sum1 * suminv );
+  sum2 = (0.5f*255.0f) + ( sum2 * suminv );
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernel4Normal( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float f;
+#endif
+  float sum0, sum1, sum2, sum3, suminv;
+  float *matrix;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( matrix[x] ), vsrc ) );
+#else
+      f = matrix[x];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      sum3 += f * (float)src[ mapx + 3 ];
+#endif
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_sub_ps( vsum, _mm_set_ps( 1.0f/255.0f, 1.0f/255.0f, 1.0f/255.0f, 0.0f ) );
+  sum0 = _mm_cvtss_f32( vsum );
+ #if CPU_SSE3_SUPPORT
+  sum1 = _mm_cvtss_f32( _mm_movehdup_ps( vsum ) );
+ #else
+  sum1 = _mm_cvtss_f32( _mm_shuffle_ps( vsum, vsum, 0x55 ) );
+ #endif
+  sum2 = _mm_cvtss_f32( _mm_movehl_ps( vsum, vsum ) );
+  sum3 = _mm_cvtss_f32( _mm_shuffle_ps( vsum, vsum, 0xff ) );
+#else
+  sum0 -= 0.5f*255.0f;
+  sum1 -= 0.5f*255.0f;
+  sum2 -= 0.5f*255.0f;
+#endif
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) );
+  sum0 = (0.5f*255.0f) + ( sum0 * suminv );
+  sum1 = (0.5f*255.0f) + ( sum1 * suminv );
+  sum2 = (0.5f*255.0f) + ( sum2 * suminv );
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+static void imStaticKernel3NormalSustain( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, v0, v1, v2, energy, sum0, sum1, sum2, sumenergy, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sumenergy = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx + ( pointx << 1 );
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      v0 = f * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = f * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = f * ( (float)src[ mapx + 2 ] - 127.5f );
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+  sum0 *= suminv;
+  sum1 *= suminv;
+  sum2 *= suminv;
+  energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  sumenergy *= state->normalsustainfactor;
+  if( energy < sumenergy )
+  {
+    f = fminf( sumenergy / energy, 8.0f );
+    sum0 *= f;
+    sum1 *= f;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+  }
+  sum0 += (0.5f*255.0f);
+  sum1 += (0.5f*255.0f);
+  sum2 += (0.5f*255.0f);
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernel4NormalSustain( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  sumenergy = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x];
+      v0 = f * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = f * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = f * ( (float)src[ mapx + 2 ] - 127.5f );
+      v3 = f * (float)src[ mapx + 3 ];
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      sum3 += v3;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+  sum0 *= suminv;
+  sum1 *= suminv;
+  sum2 *= suminv;
+  energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  sumenergy *= state->normalsustainfactor;
+  if( energy < sumenergy )
+  {
+    f = fminf( sumenergy / energy, 8.0f );
+    sum0 *= f;
+    sum1 *= f;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+  }
+  sum0 += (0.5f*255.0f);
+  sum1 += (0.5f*255.0f);
+  sum2 += (0.5f*255.0f);
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+static void imStaticKernel4NormalSustainAlphaNorm( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy;
+  float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  sumenergy = 0.0f;
+  matrix = state->matrix;
+  mapy = pointy;
+  for( y = 0 ; y < state->matrixsize ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = pointx << 2;
+    for( x = 0 ; x < state->matrixsize ; x++ )
+    {
+      f = matrix[x] * (float)src[ mapx + 3 ];
+      v0 = f * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = f * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = f * ( (float)src[ mapx + 2 ] - 127.5f );
+      v3 = f;
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      sum3 += v3;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    matrix = ADDRESS( matrix, state->matrixrowsize );
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0f / sum3;
+    sum0 *= f;
+    sum1 *= f;
+    sum2 *= f;
+    sum0 *= state->amplifynormal;
+    sum1 *= state->amplifynormal;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+    energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+    sumenergy *= state->normalsustainfactor;
+    if( energy < sumenergy )
+    {
+      f = fminf( sumenergy / energy, 8.0f );
+      sum0 *= f;
+      sum1 *= f;
+      suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+      sum0 *= suminv;
+      sum1 *= suminv;
+      sum2 *= suminv;
+    }
+    sum0 += (0.5f*255.0f);
+    sum1 += (0.5f*255.0f);
+    sum2 += (0.5f*255.0f);
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+
+  return;
+}
+
+
+////
+
+
+static void imStaticKernelPoT3Water( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy, heightmask, widthmask;
+  int minx, maxx, miny, maxy;
+  float f, sum0, sum1, sum2, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  minx = pointx;
+  maxx = minx + state->matrixsize;
+  miny = pointy;
+  maxy = miny + state->matrixsize;
+  heightmask = state->height - 1;
+  widthmask = state->width1 - 1;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  matrix = state->matrix;
+  for( y = miny ; y < maxy ; y++ )
+  {
+    mapy = y & heightmask;
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    for( x = minx ; x < maxx ; x++, matrix++ )
+    {
+      mapx = x & widthmask;
+      mapx += mapx << 1;
+      f = *matrix;
+      sum0 += (float)src[ mapx + 0 ] * f;
+      sum1 += (float)src[ mapx + 1 ] * f;
+      sum2 += (float)src[ mapx + 2 ] * f;
+    }
+    matrix += state->rowreturn;
+  }
+
+  sum0 *= 1.0f/255.0f;
+  sum1 *= 1.0f/255.0f;
+  sum2 *= 1.0f/255.0f;
+  sum0 = 2.0f * ( sum0 - 0.5f );
+  sum1 = 2.0f * ( sum1 - 0.5f );
+  suminv = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  if( suminv < 0.75f )
+  {
+    suminv = 0.5f / suminv;
+    sum0 = 0.5f + ( sum0 * suminv );
+    sum1 = 0.5f + ( sum1 * suminv );
+  }
+  if( sum2 > 0.1f )
+  {
+    state->dithersum += sum2;
+    if( sum2 > 0.45f )
+      sum2 = 1.0f;
+    else if( ( sum2 < 0.3f ) && ( state->dithersum < 1.0f ) )
+      sum2 = 0.0f;
+    else
+      sum2 = ( ( sum2 + state->dithersum ) < 0.45f ? 0.0f : 1.0f );
+    state->dithersum -= sum2;
+  }
+  sum0 *= 255.0f;
+  sum1 *= 255.0f;
+  sum2 *= 255.0f;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+static void imStaticKernelPoT4Water( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy, heightmask, widthmask;
+  int minx, maxx, miny, maxy;
+  float f, sum0, sum1, sum2, sum3, suminv;
+  float *matrix;
+  unsigned char *src;
+
+  minx = pointx;
+  maxx = minx + state->matrixsize;
+  miny = pointy;
+  maxy = miny + state->matrixsize;
+  heightmask = state->height - 1;
+  widthmask = state->width1 - 1;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrix = state->matrix;
+  for( y = miny ; y < maxy ; y++ )
+  {
+    mapy = y & heightmask;
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    for( x = minx ; x < maxx ; x++, matrix++ )
+    {
+      mapx = x & widthmask;
+      mapx <<= 2;
+      f = *matrix;
+      sum0 += (float)src[ mapx + 0 ] * f;
+      sum1 += (float)src[ mapx + 1 ] * f;
+      sum2 += (float)src[ mapx + 2 ] * f;
+      sum3 += (float)src[ mapx + 3 ] * f;
+    }
+    matrix += state->rowreturn;
+  }
+
+  sum0 *= 1.0f/255.0f;
+  sum1 *= 1.0f/255.0f;
+  sum2 *= 1.0f/255.0f;
+  sum0 = 2.0f * ( sum0 - 0.5f );
+  sum1 = 2.0f * ( sum1 - 0.5f );
+  suminv = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  if( suminv < 0.75f )
+  {
+    suminv = 0.5f / suminv;
+    sum0 = 0.5f + ( sum0 * suminv );
+    sum1 = 0.5f + ( sum1 * suminv );
+  }
+  if( sum2 > 0.1f )
+  {
+    state->dithersum += sum2;
+    if( sum2 > 0.45f )
+      sum2 = 1.0f;
+    else if( ( sum2 < 0.3f ) && ( state->dithersum < 1.0f ) )
+      sum2 = 0.0f;
+    else
+      sum2 = ( ( sum2 + state->dithersum ) < 0.45f ? 0.0f : 1.0f );
+    state->dithersum -= sum2;
+  }
+  sum0 *= 255.0f;
+  sum1 *= 255.0f;
+  sum2 *= 255.0f;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+static void imStaticKernelPoT4Plant( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state )
+{
+  int x, y, mapx, mapy, heightmask, widthmask;
+  int minx, maxx, miny, maxy;
+  float f, sum0, sum1, sum2, sum3;
+  float *matrix;
+  unsigned char *src;
+
+  minx = pointx;
+  maxx = minx + state->matrixsize;
+  miny = pointy;
+  maxy = miny + state->matrixsize;
+  heightmask = state->height - 1;
+  widthmask = state->width1 - 1;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrix = state->matrix;
+  for( y = miny ; y < maxy ; y++ )
+  {
+    mapy = y & heightmask;
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    for( x = minx ; x < maxx ; x++, matrix++ )
+    {
+      mapx = x & widthmask;
+      mapx <<= 2;
+      f = *matrix;
+      sum0 += (float)src[ mapx + 0 ] * f;
+      sum1 += (float)src[ mapx + 1 ] * f;
+      sum2 += (float)src[ mapx + 2 ] * f;
+      sum3 += (float)src[ mapx + 3 ] * f;
+    }
+    matrix += state->rowreturn;
+  }
+
+  sum3 *= 1.25f;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options )
+{
+  int filter, x, y, pointx, pointy, basex, basey, pow2flag;
+  int newwidth, newheight;
+  unsigned char *dst;
+  imStaticMatrixState state;
+  void (*applykernel)( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state );
+#if CPU_SSE2_SUPPORT
+  int corebase, corerange;
+  void (*applykernelcore)( unsigned char *dst, int pointx, int pointy, imStaticMatrixState * restrict state );
+#endif
+
+  filter = options->filter;
+  imBuildStaticMatrix( &state, sizedivisor, options->hopcount, options->alpha );
+
+  newwidth = ( width < sizedivisor ) ? 1 : ( ( width + sizedivisor - 1 ) / sizedivisor );
+  newheight = ( height < sizedivisor ) ? 1 : ( ( height + sizedivisor - 1 ) / sizedivisor );
+
+  pow2flag = ccIsPow2Int32( width ) && ccIsPow2Int32( height );
+  applykernel = 0;
+#if CPU_SSE2_SUPPORT
+  applykernelcore = 0;
+#endif
+
+  if( filter == IM_REDUCE_FILTER_LINEAR )
+  {
+    if( bytesperpixel == 4 )
+    {
+      applykernel = imStaticKernel4Linear;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imStaticKernel4Linear_Core;
+#endif
+    }
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3Linear;
+    else if( bytesperpixel == 2 )
+      applykernel = imStaticKernel2Linear;
+    else if( bytesperpixel == 1 )
+      applykernel = imStaticKernel1Linear;
+  }
+  else if( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+    {
+      applykernel = imStaticKernel4LinearAlphaNorm;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imStaticKernel4LinearAlphaNorm_Core;
+#endif
+    }
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3Linear;
+    else if( bytesperpixel == 2 )
+      applykernel = imStaticKernel2Linear;
+    else if( bytesperpixel == 1 )
+      applykernel = imStaticKernel1Linear;
+  }
+  else if( filter == IM_REDUCE_FILTER_SRGB )
+  {
+    if( bytesperpixel == 4 )
+    {
+      applykernel = imStaticKernel4sRGB;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imStaticKernel4sRGB_Core;
+#endif
+    }
+    else if( bytesperpixel == 3 )
+    {
+      applykernel = imStaticKernel3sRGB;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imStaticKernel3sRGB_Core;
+#endif
+    }
+    else if( bytesperpixel == 2 )
+      applykernel = imStaticKernel2sRGB;
+    else if( bytesperpixel == 1 )
+      applykernel = imStaticKernel1sRGB;
+  }
+  else if( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+    {
+      applykernel = imStaticKernel4sRGBAlphaNorm;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imStaticKernel4sRGBAlphaNorm_Core;
+#endif
+    }
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3sRGB;
+    else if( bytesperpixel == 2 )
+      applykernel = imStaticKernel2sRGB;
+    else if( bytesperpixel == 1 )
+      applykernel = imStaticKernel1sRGB;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imStaticKernel4Normal;
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3Normal;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imStaticKernel4NormalSustain;
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3NormalSustain;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imStaticKernel4NormalSustainAlphaNorm;
+    else if( bytesperpixel == 3 )
+      applykernel = imStaticKernel3NormalSustain;
+  }
+  else if( filter == IM_REDUCE_FILTER_WATERMAP )
+  {
+    if( ( bytesperpixel == 4 ) && ( pow2flag ) )
+      applykernel = imStaticKernelPoT4Water;
+    else if( ( bytesperpixel == 3 ) && ( pow2flag ) )
+      applykernel = imStaticKernelPoT3Water;
+  }
+  else if( filter == IM_REDUCE_FILTER_PLANTMAP )
+  {
+    if( ( bytesperpixel == 4 ) && ( pow2flag ) )
+      applykernel = imStaticKernelPoT4Plant;
+  }
+
+  if( !applykernel )
+    return 0;
+
+#if CPU_SSE2_SUPPORT
+  corebase = -state.matrixoffset;
+  corerange = ( newwidth + state.matrixoffset ) - corebase;
+#endif
+
+  state.dithersum = 0.0f;
+  if( ( newwidth | newheight ) > 2 )
+    state.dithersum = 0.5f;
+
+  state.srcdata = srcdata;
+  state.width1 = width * 1;
+  state.width2 = width * 2;
+  state.width3 = width * 3;
+  state.width4 = width * 4;
+  state.height = height;
+  state.bytesperline = bytesperline;
+
+  state.minimumalpha = 4;
+  state.minimumalphaf = (float)state.minimumalpha;
+  state.amplifynormal = fmaxf( 1.0f, options->amplifynormal );
+  state.normalsustainfactor = options->normalsustainfactor;
+
+  basex = ( state.matrixoffset + ( width << 8 ) ) % width;
+  basey = ( state.matrixoffset + ( height << 8 ) ) % height;
+  while( basex < 0 )
+    basex += width;
+  while( basey < 0 )
+    basey += height;
+
+#if CPU_SSE2_SUPPORT
+  if( applykernelcore )
+  {
+    dst = dstdata;
+    pointy = basey;
+    for( y = 0 ; y < newheight ; y++ )
+    {
+      pointx = basex;
+      for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel )
+      {
+        ( (unsigned int)( x - corebase ) < corerange ? applykernelcore : applykernel )( dst, pointx, pointy, &state );
+        pointx += sizedivisor;
+        while( pointx >= width )
+          pointx -= width;
+      }
+      pointy += sizedivisor;
+      while( pointy >= height )
+        pointy -= height;
+    }
+  }
+  else
+#endif
+  {
+    dst = dstdata;
+    pointy = basey;
+    for( y = 0 ; y < newheight ; y++ )
+    {
+      pointx = basex;
+      for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel )
+      {
+        applykernel( dst, pointx, pointy, &state );
+        pointx += sizedivisor;
+        while( pointx >= width )
+          pointx -= width;
+      }
+      pointy += sizedivisor;
+      while( pointy >= height )
+        pointy -= height;
+    }
+  }
+
+  imFreeStaticState( &state );
+
+  return 1;
+}
+
+
+int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options )
+{
+  int width, height;
+  int newwidth, newheight, retvalue;
+
+  width = imgsrc->format.width;
+  height = imgsrc->format.height;
+  newwidth = ( width < sizedivisor ) ? 1 : ( ( width + sizedivisor - 1 ) / sizedivisor );
+  newheight = ( height < sizedivisor ) ? 1 : ( ( height + sizedivisor - 1 ) / sizedivisor );
+
+  imgdst->format.width = newwidth;
+  imgdst->format.height = newheight;
+  imgdst->format.type = imgsrc->format.type;
+  imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel;
+  imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel;
+  imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline );
+  if( !( imgdst->data ) )
+    return 0;
+
+  retvalue = imReduceImageKaiserDataDivisor( imgdst->data, imgsrc->data, width, height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, sizedivisor, options );
+
+  return retvalue;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+static void imDynamicKernel1Linear( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0;
+  float matrixsum;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      matrixsum += f;
+      mapx++;
+      if( mapx >= state->width1 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  sum0 /= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+
+  return;
+}
+
+
+static void imDynamicKernel2Linear( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1;
+  float matrixsum;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 1;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      matrixsum += f;
+      mapx += 2;
+      if( mapx >= state->width2 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+
+  return;
+}
+
+
+static void imDynamicKernel3Linear( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2;
+  float matrixsum;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 );
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      matrixsum += f;
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+
+static void imDynamicKernel4Linear( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2, sum3;
+  float matrixsum;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      sum3 += f * (float)src[ mapx + 3 ];
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+static void imDynamicKernel4LinearAlphaNorm( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2, sum3, alpha;
+  float matrixsum;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      alpha = (float)src[ mapx + 3 ] * f;
+      sum0 += alpha * (float)src[ mapx + 0 ];
+      sum1 += alpha * (float)src[ mapx + 1 ];
+      sum2 += alpha * (float)src[ mapx + 2 ];
+      sum3 += alpha;
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0 / sum3;
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum0 * f ) + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum1 * f ) + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, ( sum2 * f ) + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+
+  return;
+}
+
+
+#if CPU_SSE2_SUPPORT
+
+static void imDynamicKernel4LinearAlphaNorm_Core( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  uint32_t pixel;
+  unsigned char *src;
+  __m128 vmatrixsum, vsum0, vsum1, vsum2, vsum3;
+  __m128 vlx, vly, vf, valpha, vr, vg, vb, va, vsrcf;
+  __m128i vsrc, vshufmask;
+  __m128i vzero;
+
+ #if CPU_SSSE3_SUPPORT
+  vshufmask = _mm_setr_epi8( 0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f );
+ #endif
+  vsum0 = _mm_setzero_ps();
+  vsum1 = _mm_setzero_ps();
+  vsum2 = _mm_setzero_ps();
+  vsum3 = _mm_setzero_ps();
+  vmatrixsum = _mm_setzero_ps();
+  vzero = _mm_castps_si128( _mm_setzero_ps() );
+
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    vly = _mm_set1_ps( state->lineary[y] );
+    for( x = 0 ; x < state->matrixsizex ; x += 4 )
+    {
+      vlx = _mm_load_ps( &state->linearx[x] );
+      /* Load 16 bytes and unpack as RRRR,GGGG,BBBB,AAAA in one SSE register */
+      vsrc = _mm_loadu_si128( (void *)&src[ mapx ] );
+ #if CPU_SSSE3_SUPPORT
+      vsrc = _mm_shuffle_epi8( vsrc, vshufmask );
+ #else
+      vshufmask = _mm_shuffle_epi32( vsrc, 0x39 );
+      vsrc = _mm_unpacklo_epi16( _mm_unpacklo_epi8( vsrc, vshufmask ), _mm_unpackhi_epi8( vsrc, vshufmask ) );
+ #endif
+      /* Break that into 4 SSE registers as floats: vR,vG,vB,vA */
+      vsrcf = _mm_castsi128_ps( vsrc );
+      vr = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( vsrcf ), vzero ) );
+ #if CPU_SSE3_SUPPORT
+      vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehdup_ps( vsrcf ) ), vzero ) );
+ #else
+      vg = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0x55 ) ), vzero ) );
+ #endif
+      vb = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_movehl_ps( vsrcf, vsrcf ) ), vzero ) );
+      va = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_shuffle_ps( vsrcf, vsrcf, 0xff ) ), vzero ) );
+      vf = _mm_mul_ps( vlx, vly );
+      valpha = _mm_mul_ps( va, vf );
+      vsum0 = _mm_add_ps( vsum0, _mm_mul_ps( vr, valpha ) );
+      vsum1 = _mm_add_ps( vsum1, _mm_mul_ps( vg, valpha ) );
+      vsum2 = _mm_add_ps( vsum2, _mm_mul_ps( vb, valpha ) );
+      vsum3 = _mm_add_ps( vsum3, valpha );
+      vmatrixsum = _mm_add_ps( vmatrixsum, vf );      
+      mapx += 16;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+ #if CPU_SSE3_SUPPORT
+  vmatrixsum = _mm_hadd_ps( vmatrixsum, vmatrixsum );
+  vmatrixsum = _mm_hadd_ps( vmatrixsum, vmatrixsum );
+ #else
+  vmatrixsum = _mm_add_ps( vmatrixsum, _mm_shuffle_ps( vmatrixsum, vmatrixsum, 0x4e ) );
+  vmatrixsum = _mm_add_ps( vmatrixsum, _mm_shuffle_ps( vmatrixsum, vmatrixsum, 0x39 ) );
+ #endif
+
+ #if CPU_SSE3_SUPPORT
+  vsum0 = _mm_hadd_ps( vsum0, vsum1 );
+  vsum2 = _mm_hadd_ps( vsum2, vsum3 );
+  vsum0 = _mm_hadd_ps( vsum0, vsum2 );
+ #else
+  vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum2 ), _mm_unpackhi_ps( vsum0, vsum2 ) );
+  vsum1 = _mm_add_ps( _mm_unpacklo_ps( vsum1, vsum3 ), _mm_unpackhi_ps( vsum1, vsum3 ) );
+  vsum0 = _mm_add_ps( _mm_unpacklo_ps( vsum0, vsum1 ), _mm_unpackhi_ps( vsum0, vsum1 ) );
+ #endif
+  vsum0 = _mm_div_ps( vsum0, vmatrixsum );
+
+  valpha = _mm_shuffle_ps( vsum0, vsum0, 0xff );
+  pixel = 0;
+  if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) )
+  {
+    __m128i vpixel;
+    vsum0 = _mm_mul_ps( vsum0, _mm_rcp_ps( valpha ) );
+    vsum0 = CPU_BLENDV_PS( vsum0, valpha, *(__m128 *)simd4fAlphaMask );
+    vpixel = _mm_cvtps_epi32( vsum0 );
+    vpixel = _mm_packs_epi32( vpixel, vpixel );
+    vpixel = _mm_packus_epi16( vpixel, vpixel );
+    pixel = (uint32_t)_mm_cvtsi128_si32( vpixel );
+  }
+  *(uint32_t *)dst = pixel;
+
+  return;
+}
+
+#endif
+
+
+////
+
+
+static void imDynamicKernel1sRGB( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float sum0;
+#endif
+  float f, matrixsum;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+#endif
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      f = state->linearx[x] * state->lineary[y];
+      vsrc = _mm_set_ss( (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) );
+#else
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+#endif
+      matrixsum += f;
+      mapx++;
+      if( mapx >= state->width1 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
+  dst[0] = _mm_cvtsi128_si32( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) );
+#else
+  sum0 /= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+static void imDynamicKernel2sRGB( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float sum0, sum1;
+#endif
+  float f, matrixsum;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+#endif
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 1;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      f = state->linearx[x] * state->lineary[y];
+      vsrc = _mm_set_ps( 0.0f, 0.0f, (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) );
+#else
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+#endif
+      matrixsum += f;
+      mapx += 2;
+      if( mapx >= state->width2 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+  vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+  dst[1] = u.c[1];
+#else
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+static void imDynamicKernel3sRGB( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float sum0, sum1, sum2;
+#endif
+  float f, matrixsum;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+#endif
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 );
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      f = state->linearx[x] * state->lineary[y];
+      vsrc = _mm_set_ps( 0.0f, (float)src[ mapx + 2 ], (float)src[ mapx + 1 ], (float)src[ mapx + 0 ] );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) );
+#else
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += f * srgb2linear( (float)src[ mapx + 2 ] );
+#endif
+      matrixsum += f;
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  union
+  {
+    char c[4];
+    uint32_t i;
+  } u;
+  vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
+  _mm_store_ss( (void *)&u.i, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+  dst[0] = u.c[0];
+  dst[1] = u.c[1];
+  dst[2] = u.c[2];
+#else
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+static void imDynamicKernel4sRGB( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc;
+  __m128i vzero;
+#else
+  float sum0, sum1, sum2, sum3;
+#endif
+  float f, matrixsum;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      f = state->linearx[x] * state->lineary[y];
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_set1_ps( f ), vsrc ) );
+#else
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += f * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += f * srgb2linear( (float)src[ mapx + 2 ] );
+      sum3 += f * (float)src[ mapx + 3 ];
+#endif
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  vsum = linear2srgb3( _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) ) );
+  _mm_store_ss( (void *)dst, _mm_castsi128_ps( _mm_packus_epi16( _mm_packs_epi32( _mm_cvtps_epi32( vsum ), vzero ), vzero ) ) );
+#else
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 ) + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+#endif
+
+  return;
+}
+
+
+////
+
+
+static void imDynamicKernel4sRGBAlphaNorm( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+#if CPU_SSE2_SUPPORT
+  __m128 vsum, vsrc, valpha;
+  __m128i vzero;
+  uint32_t pixel;
+#else
+  float sum0, sum1, sum2, sum3, alpha;
+#endif
+  float f, matrixsum;
+  unsigned char *src;
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_setzero_ps();
+  vzero = _mm_setzero_si128();
+#else
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+#endif
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+#if CPU_SSE2_SUPPORT
+      f = state->linearx[x] * state->lineary[y];
+      vsrc = _mm_cvtepi32_ps( CPU_CVT_U8_TO_I32( _mm_castps_si128( _mm_load_ss( (void *)&src[ mapx ] ) ), vzero ) );
+      valpha = _mm_shuffle_ps( vsrc, _mm_set_ss( 1.0f ), 0x0f );
+      vsrc = srgb2linear3( vsrc );
+      vsum = _mm_add_ps( vsum, _mm_mul_ps( _mm_mul_ps( _mm_shuffle_ps( valpha, valpha, 0xC0 ), _mm_set1_ps( f ) ), vsrc ) );
+#else
+      f = state->linearx[x] * state->lineary[y];
+      alpha = (float)src[ mapx + 3 ] * f;
+      sum0 += alpha * srgb2linear( (float)src[ mapx + 0 ] );
+      sum1 += alpha * srgb2linear( (float)src[ mapx + 1 ] );
+      sum2 += alpha * srgb2linear( (float)src[ mapx + 2 ] );
+      sum3 += alpha;
+#endif
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+#if CPU_SSE2_SUPPORT
+  vsum = _mm_div_ps( vsum, _mm_set1_ps( matrixsum ) );
+  valpha = _mm_shuffle_ps( vsum, vsum, 0xff );
+  pixel = 0;
+  if( _mm_comige_ss( valpha, _mm_load_ss( &state->minimumalphaf ) ) )
+  {
+    __m128i vpixel;
+    vsum = _mm_mul_ps( vsum, _mm_rcp_ps( valpha ) );
+    vsum = CPU_BLENDV_PS( vsum, valpha, *(__m128 *)simd4fAlphaMask );
+    vsum = linear2srgb3( vsum );
+    vpixel = _mm_cvtps_epi32( vsum );
+    vpixel = _mm_packs_epi32( vpixel, vpixel );
+    vpixel = _mm_packus_epi16( vpixel, vpixel );
+    pixel = (uint32_t)_mm_cvtsi128_si32( vpixel );
+  }
+  *(uint32_t *)dst = pixel;
+#else
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0f / sum3;
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * f ) + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * f ) + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * f ) + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+#endif
+
+  return;
+}
+
+
+////
+
+
+static void imDynamicKernel3Normal( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2;
+  float matrixsum, suminv;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 );
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      matrixsum += f;
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = (1.0f/255.0f) / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum0 -= 0.5f;
+  sum1 -= 0.5f;
+  sum2 -= 0.5f;
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) );
+  sum0 = (0.5f*255.0f) + ( sum0 * suminv );
+  sum1 = (0.5f*255.0f) + ( sum1 * suminv );
+  sum2 = (0.5f*255.0f) + ( sum2 * suminv );
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+
+static void imDynamicKernel4Normal( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, sum0, sum1, sum2, sum3;
+  float matrixsum, suminv;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      sum0 += f * (float)src[ mapx + 0 ];
+      sum1 += f * (float)src[ mapx + 1 ];
+      sum2 += f * (float)src[ mapx + 2 ];
+      sum3 += f * (float)src[ mapx + 3 ];
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = (1.0f/255.0f) / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  sum0 -= 0.5f;
+  sum1 -= 0.5f;
+  sum2 -= 0.5f;
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) );
+  sum0 = (0.5f*255.0f) + ( sum0 * suminv );
+  sum1 = (0.5f*255.0f) + ( sum1 * suminv );
+  sum2 = (0.5f*255.0f) + ( sum2 * suminv );
+  sum3 *= 255.0f;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+  return;
+}
+
+
+////
+
+
+static void imDynamicKernel3NormalSustain( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, v0, v1, v2, energy, sum0, sum1, sum2, sumenergy;
+  float matrixsum, suminv;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sumenergy = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx + ( state->matrixoffsetx << 1 );
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      v0 = f * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = f * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = f * ( (float)src[ mapx + 2 ] - 127.5f );
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      matrixsum += f;
+      mapx += 3;
+      if( mapx >= state->width3 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = (1.0f/255.0f) / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+  sum0 *= suminv;
+  sum1 *= suminv;
+  sum2 *= suminv;
+  energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  sumenergy *= state->normalsustainfactor;
+  if( energy < sumenergy )
+  {
+    f = fminf( sumenergy / energy, 8.0f );
+    sum0 *= f;
+    sum1 *= f;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+  }
+  sum0 += (0.5f*255.0f);
+  sum1 += (0.5f*255.0f);
+  sum2 += (0.5f*255.0f);
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+
+  return;
+}
+
+static void imDynamicKernel4NormalSustain( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy;
+  float matrixsum, suminv;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  sumenergy = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      v0 = f * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = f * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = f * ( (float)src[ mapx + 2 ] - 127.5f );
+      v3 = f * (float)src[ mapx + 3 ];
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      sum3 += v3;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = (1.0f/255.0f) / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  sum0 *= state->amplifynormal;
+  sum1 *= state->amplifynormal;
+  suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+  sum0 *= suminv;
+  sum1 *= suminv;
+  sum2 *= suminv;
+  energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+  sumenergy *= state->normalsustainfactor;
+  if( energy < sumenergy )
+  {
+    f = fminf( sumenergy / energy, 8.0f );
+    sum0 *= f;
+    sum1 *= f;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+  }
+  sum0 += (0.5f*255.0f);
+  sum1 += (0.5f*255.0f);
+  sum2 += (0.5f*255.0f);
+  sum3 *= 255.0f;
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+
+
+  return;
+}
+
+
+////
+
+
+static void imDynamicKernel4NormalSustainAlphaNorm( unsigned char *dst, imGenericMatrixState *state )
+{
+  int x, y, mapx, mapy;
+  float f, alpha, v0, v1, v2, v3, energy, sum0, sum1, sum2, sum3, sumenergy;
+  float matrixsum, suminv;
+  unsigned char *src;
+
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 0.0f;
+  sumenergy = 0.0f;
+  matrixsum = 0.0f;
+  mapy = state->matrixoffsety;
+  for( y = 0 ; y < state->matrixsizey ; y++ )
+  {
+    src = ADDRESS( state->srcdata, ( mapy * state->bytesperline ) );
+    mapx = state->matrixoffsetx << 2;
+    for( x = 0 ; x < state->matrixsizex ; x++ )
+    {
+      f = state->linearx[x] * state->lineary[y];
+      alpha = (float)src[ mapx + 3 ] * f;
+      v0 = alpha * ( (float)src[ mapx + 0 ] - 127.5f );
+      v1 = alpha * ( (float)src[ mapx + 1 ] - 127.5f );
+      v2 = alpha * ( (float)src[ mapx + 2 ] - 127.5f );
+      v3 = alpha;
+      sum0 += v0;
+      sum1 += v1;
+      sum2 += v2;
+      sum3 += v3;
+      energy = ( v0 * v0 ) + ( v1 * v1 );
+      if( energy )
+        sumenergy += sqrtf( energy ) / sqrtf( energy + ( v2 * v2 ) );
+      matrixsum += f;
+      mapx += 4;
+      if( mapx >= state->width4 )
+        mapx = 0;
+    }
+    mapy++;
+    if( mapy >= state->height )
+      mapy = 0;
+  }
+
+  matrixsum = 1.0f / matrixsum;
+  sum0 *= matrixsum;
+  sum1 *= matrixsum;
+  sum2 *= matrixsum;
+  sum3 *= matrixsum;
+  if( sum3 >= state->minimumalphaf )
+  {
+    f = 1.0f / sum3;
+    sum0 *= f;
+    sum1 *= f;
+    sum2 *= f;
+    sum0 *= state->amplifynormal;
+    sum1 *= state->amplifynormal;
+    suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+    sum0 *= suminv;
+    sum1 *= suminv;
+    sum2 *= suminv;
+    energy = sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) );
+    sumenergy *= state->normalsustainfactor;
+    if( energy < sumenergy )
+    {
+      f = fminf( sumenergy / energy, 8.0f );
+      sum0 *= f;
+      sum1 *= f;
+      suminv = (0.5f*255.0f) / fmaxf( 0.0625f, sqrtf( ( sum0 * sum0 ) + ( sum1 * sum1 ) + ( sum2 * sum2 ) ) );
+      sum0 *= suminv;
+      sum1 *= suminv;
+      sum2 *= suminv;
+    }
+    sum0 += (0.5f*255.0f);
+    sum1 += (0.5f*255.0f);
+    sum2 += (0.5f*255.0f);
+    dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum0 + 0.5f ) ) );
+    dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum1 + 0.5f ) ) );
+    dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum2 + 0.5f ) ) );
+    dst[3] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, sum3 + 0.5f ) ) );
+  }
+  else
+  {
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = 0;
+  }
+
+  return;
+}
+
+
+////
+
+
+int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options )
+{
+  int filter, x, y;
+  float scalex, scaley, scaleinvx, scaleinvy;
+  float sourcex, sourcey;
+  unsigned char *dst;
+  imGenericMatrixState state;
+  void (*applykernel)( unsigned char *dst, imGenericMatrixState *state );
+#if CPU_SSE2_SUPPORT
+  void (*applykernelcore)( unsigned char *dst, imGenericMatrixState *state );
+#endif
+
+  filter = options->filter;
+  if( ( newwidth > width ) || ( newheight > height ) )
+    return 0;
+
+  applykernel = 0;
+#if CPU_SSE2_SUPPORT
+  applykernelcore = 0;
+#endif
+
+  if( filter == IM_REDUCE_FILTER_LINEAR )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4Linear;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3Linear;
+    else if( bytesperpixel == 2 )
+      applykernel = imDynamicKernel2Linear;
+    else if( bytesperpixel == 1 )
+      applykernel = imDynamicKernel1Linear;
+  }
+  else if( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+    {
+      applykernel = imDynamicKernel4LinearAlphaNorm;
+#if CPU_SSE2_SUPPORT
+      applykernelcore = imDynamicKernel4LinearAlphaNorm_Core;
+#endif
+    }
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3Linear;
+    else if( bytesperpixel == 2 )
+      applykernel = imDynamicKernel2Linear;
+    else if( bytesperpixel == 1 )
+      applykernel = imDynamicKernel1Linear;
+  }
+  else if( filter == IM_REDUCE_FILTER_SRGB )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4sRGB;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3sRGB;
+    else if( bytesperpixel == 2 )
+      applykernel = imDynamicKernel2sRGB;
+    else if( bytesperpixel == 1 )
+      applykernel = imDynamicKernel1sRGB;
+  }
+  else if( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4sRGBAlphaNorm;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3sRGB;
+    else if( bytesperpixel == 2 )
+      applykernel = imDynamicKernel2sRGB;
+    else if( bytesperpixel == 1 )
+      applykernel = imDynamicKernel1sRGB;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4Normal;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3Normal;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4NormalSustain;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3NormalSustain;
+  }
+  else if( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM )
+  {
+    if( bytesperpixel == 4 )
+      applykernel = imDynamicKernel4NormalSustainAlphaNorm;
+    else if( bytesperpixel == 3 )
+      applykernel = imDynamicKernel3NormalSustain;
+  }
+
+  if( !applykernel )
+    return 0;
+
+  state.minimumalpha = 4;
+  state.minimumalphaf = (float)state.minimumalpha;
+  state.amplifynormal = fmaxf( 1.0f, options->amplifynormal );
+  state.normalsustainfactor = options->normalsustainfactor;
+
+  state.dithersum = 0.0f;
+  if( ( newwidth | newheight ) > 2 )
+    state.dithersum = 0.5f;
+
+  state.srcdata = srcdata;
+  state.width1 = width * 1;
+  state.width2 = width * 2;
+  state.width3 = width * 3;
+  state.width4 = width * 4;
+  state.height = height;
+  state.bytesperline = bytesperline;
+
+  scalex = (float)newwidth / (float)width;
+  scaley = (float)newheight / (float)height;
+  scaleinvx = (float)width / (float)newwidth;
+  scaleinvy = (float)height / (float)newheight;
+
+  imAllocGenericState( &state, scalex, scaley, options->hopcount, options->alpha );
+
+#if CPU_SSE2_SUPPORT
+  if( applykernelcore )
+  {
+    dst = dstdata;
+    for( y = 0 ; y < newheight ; y++ )
+    {
+      sourcey = ( ( (float)y + 0.5f ) * scaleinvy ) - 0.5f;
+      imBuildGenericLinearY( &state, scaley, scaleinvy, sourcey, options->hopcount, options->alpha, height );
+      for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel )
+      {
+        sourcex = ( ( (float)x + 0.5f ) * scaleinvx ) - 0.5f;
+        imBuildGenericLinearX( &state, scalex, scaleinvx, sourcex, options->hopcount, options->alpha, width );
+        if( ( state.matrixoffsetx + ( ( state.matrixsizex + 3 ) & ~3 ) ) < width )
+          applykernelcore( dst, &state );
+        else
+          applykernel( dst, &state );
+      }
+    }
+  }
+  else
+#endif
+  {
+    dst = dstdata;
+    for( y = 0 ; y < newheight ; y++ )
+    {
+      sourcey = ( ( (float)y + 0.5f ) * scaleinvy ) - 0.5f;
+      imBuildGenericLinearY( &state, scaley, scaleinvy, sourcey, options->hopcount, options->alpha, height );
+      for( x = 0 ; x < newwidth ; x++, dst += bytesperpixel )
+      {
+        sourcex = ( ( (float)x + 0.5f ) * scaleinvx ) - 0.5f;
+        imBuildGenericLinearX( &state, scalex, scaleinvx, sourcex, options->hopcount, options->alpha, width );
+        applykernel( dst, &state );
+      }
+    }
+  }
+
+  imFreeGenericState( &state );
+
+  return 1;
+}
+
+
+int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options )
+{
+  int retvalue;
+
+  imgdst->format.width = newwidth;
+  imgdst->format.height = newheight;
+  imgdst->format.type = imgsrc->format.type;
+  imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel;
+  imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel;
+  imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline );
+  if( !( imgdst->data ) )
+    return 0;
+
+  retvalue = imReduceImageKaiserData( imgdst->data, imgsrc->data, imgsrc->format.width, imgsrc->format.height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, newwidth, newheight, options );
+
+  return retvalue;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox1Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox2Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 );
+  dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox3Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 );
+  dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 );
+  dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox4Linear( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 );
+  dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 );
+  dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 );
+  dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox1sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  int i, offset[4];
+  float sum0;
+  offset[0] = 0;
+  offset[1] = bytesperpixel;
+  offset[2] = bytesperline;
+  offset[3] = bytesperline + bytesperpixel;
+  sum0 = 0.0f;
+  for( i = 0 ; i < 4 ; i++ )
+    sum0 += srgb2linear( (float)src[offset[i]+0] );
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox2sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  int i, offset[4];
+  float sum0, sum1;
+  offset[0] = 0;
+  offset[1] = bytesperpixel;
+  offset[2] = bytesperline;
+  offset[3] = bytesperline + bytesperpixel;
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  for( i = 0 ; i < 4 ; i++ )
+  {
+    sum0 += srgb2linear( (float)src[offset[i]+0] );
+    sum1 += srgb2linear( (float)src[offset[i]+1] );
+  }
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox3sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  int i, offset[4];
+  float sum0, sum1, sum2;
+  offset[0] = 0;
+  offset[1] = bytesperpixel;
+  offset[2] = bytesperline;
+  offset[3] = bytesperline + bytesperpixel;
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  for( i = 0 ; i < 4 ; i++ )
+  {
+    sum0 += srgb2linear( (float)src[offset[i]+0] );
+    sum1 += srgb2linear( (float)src[offset[i]+1] );
+    sum2 += srgb2linear( (float)src[offset[i]+2] );
+  }
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * 0.25f ) + 0.5f ) ) );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox4sRGB( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  int i, offset[4], sum3;
+  float sum0, sum1, sum2;
+  offset[0] = 0;
+  offset[1] = bytesperpixel;
+  offset[2] = bytesperline;
+  offset[3] = bytesperline + bytesperpixel;
+  sum0 = 0.0f;
+  sum1 = 0.0f;
+  sum2 = 0.0f;
+  sum3 = 2;
+  for( i = 0 ; i < 4 ; i++ )
+  {
+    sum0 += srgb2linear( (float)src[offset[i]+0] );
+    sum1 += srgb2linear( (float)src[offset[i]+1] );
+    sum2 += srgb2linear( (float)src[offset[i]+2] );
+    sum3 += (int)src[offset[i]+2];
+  }
+  dst[0] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum0 * 0.25f ) + 0.5f ) ) );
+  dst[1] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum1 * 0.25f ) + 0.5f ) ) );
+  dst[2] = (unsigned char)( fmaxf( 0.0f, fminf( 255.0f, linear2srgb( sum2 * 0.25f ) + 0.5f ) ) );
+  dst[3] = (unsigned char)( sum3 >> 2 );
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox3Normal( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  float v0, v1, v2, suminv;
+
+  v0 = (1.0/1020.0) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] );
+  v1 = (1.0/1020.0) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] );
+  v2 = (1.0/1020.0) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] );
+  v0 = 2.0 * ( v0 - 0.5 );
+  v1 = 2.0 * ( v1 - 0.5 );
+  v2 = 2.0 * ( v2 - 0.5 );
+  suminv = 0.5 / sqrtf( ( v0 * v0 ) + ( v1 * v1 ) + ( v2 * v2 ) );
+  v0 = 0.5 + ( v0 * suminv );
+  v1 = 0.5 + ( v1 * suminv );
+  v2 = 0.5 + ( v2 * suminv );
+  dst[0] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v0 );
+  dst[1] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v1 );
+  dst[2] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v2 );
+
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox4Normal( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  float v0, v1, v2, suminv;
+
+  v0 = (1.0/1020.0) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] );
+  v1 = (1.0/1020.0) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] );
+  v2 = (1.0/1020.0) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] );
+  v0 = 2.0 * ( v0 - 0.5 );
+  v1 = 2.0 * ( v1 - 0.5 );
+  v2 = 2.0 * ( v2 - 0.5 );
+  suminv = 0.5 / sqrtf( ( v0 * v0 ) + ( v1 * v1 ) + ( v2 * v2 ) );
+  v0 = 0.5 + ( v0 * suminv );
+  v1 = 0.5 + ( v1 * suminv );
+  v2 = 0.5 + ( v2 * suminv );
+  dst[0] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v0 );
+  dst[1] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v1 );
+  dst[2] = (unsigned char)ROUND_POSITIVE_FLOAT( 255.0 * v2 );
+  dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 );
+
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox3Water( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  float v0, v1, v2, suminv;
+
+  v0 = (1.0f/1020.0f) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] );
+  v1 = (1.0f/1020.0f) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] );
+  v2 = (1.0f/1020.0f) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] );
+
+  v0 = 2.0f * ( v0 - 0.5f );
+  v1 = 2.0f * ( v1 - 0.5f );
+  suminv = sqrtf( ( v0 * v0 ) + ( v1 * v1 ) );
+  if( suminv < 0.75f )
+  {
+    suminv = 0.5f / suminv;
+    v0 = 0.5f + ( v0 * suminv );
+    v1 = 0.5f + ( v1 * suminv );
+  }
+  if( v2 > 0.1f )
+  {
+    *dithersum += v2;
+    if( v2 > 0.45f )
+      v2 = 1.0f;
+    else if( ( v2 < 0.3f ) && ( *dithersum < 1.0f ) )
+      v2 = 0.0f;
+    else
+      v2 = ( ( v2 + *dithersum ) < 0.45f ? 0.0f : 1.0f );
+    *dithersum -= v2;
+  }
+  v0 *= 255.0f;
+  v1 *= 255.0f;
+  v2 *= 255.0f;
+
+  dst[0] = (int)( fmaxf( 0.0f, fminf( 255.0f, v0 + 0.5f ) ) );
+  dst[1] = (int)( fmaxf( 0.0f, fminf( 255.0f, v1 + 0.5f ) ) );
+  dst[2] = (int)( fmaxf( 0.0f, fminf( 255.0f, v2 + 0.5f ) ) );
+
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox4Water( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  float v0, v1, v2, suminv;
+
+  v0 = (1.0f/1020.0f) * (float)( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] );
+  v1 = (1.0f/1020.0f) * (float)( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] );
+  v2 = (1.0f/1020.0f) * (float)( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] );
+
+  v0 = 2.0f * ( v0 - 0.5f );
+  v1 = 2.0f * ( v1 - 0.5f );
+  suminv = sqrtf( ( v0 * v0 ) + ( v1 * v1 ) );
+  if( suminv < 0.75f )
+  {
+    suminv = 0.5f / suminv;
+    v0 = 0.5f + ( v0 * suminv );
+    v1 = 0.5f + ( v1 * suminv );
+  }
+  if( v2 > 0.1f )
+  {
+    *dithersum += v2;
+    if( v2 > 0.45f )
+      v2 = 1.0f;
+    else if( ( v2 < 0.3f ) && ( *dithersum < 1.0f ) )
+      v2 = 0.0f;
+    else
+      v2 = ( ( v2 + *dithersum ) < 0.45f ? 0.0f : 1.0f );
+    *dithersum -= v2;
+  }
+  v0 *= 255.0f;
+  v1 *= 255.0f;
+  v2 *= 255.0f;
+
+  dst[0] = (int)( fmaxf( 0.0f, fminf( 255.0f, v0 + 0.5f ) ) );
+  dst[1] = (int)( fmaxf( 0.0f, fminf( 255.0f, v1 + 0.5f ) ) );
+  dst[2] = (int)( fmaxf( 0.0f, fminf( 255.0f, v2 + 0.5f ) ) );
+  dst[3] = (unsigned char)( ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] + 2 ) >> 2 );
+
+  return;
+}
+
+static inline CC_ALWAYSINLINE void imReduceHalfBox4Plant( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum )
+{
+  int alpha;
+
+  dst[0] = (unsigned char)( ( (int)src[0] + (int)src[bytesperpixel+0] + (int)src[bytesperline+0] + (int)src[bytesperpixel+bytesperline+0] + 2 ) >> 2 );
+  dst[1] = (unsigned char)( ( (int)src[1] + (int)src[bytesperpixel+1] + (int)src[bytesperline+1] + (int)src[bytesperpixel+bytesperline+1] + 2 ) >> 2 );
+  dst[2] = (unsigned char)( ( (int)src[2] + (int)src[bytesperpixel+2] + (int)src[bytesperline+2] + (int)src[bytesperpixel+bytesperline+2] + 2 ) >> 2 );
+
+  alpha = ( (int)src[3] + (int)src[bytesperpixel+3] + (int)src[bytesperline+3] + (int)src[bytesperpixel+bytesperline+3] );
+  alpha += alpha >> 2;
+  alpha = ( alpha + 2 ) >> 2;
+  if( alpha > 255 )
+    alpha = 255;
+  dst[3] = (unsigned char)alpha;
+
+  return;
+}
+
+
+static inline CC_ALWAYSINLINE void imReduceImageHalfBoxWork( unsigned char *dst, unsigned char *src, int width, int height, int bytesperpixel, int bytesperline, void (*work)( unsigned char *dst, unsigned char *src, int bytesperpixel, int bytesperline, float *dithersum ) )
+{
+  int x, y, newwidth, newheight, rowoffset;
+  float dithersum;
+  newwidth = ( width < 2 ) ? 1 : ( ( width + 1 ) / 2 );
+  newheight = ( height < 2 ) ? 1 : ( ( height + 1 ) / 2 );
+  rowoffset = bytesperline + ( bytesperpixel * ( width - ( newwidth << 1 ) ) );
+  dithersum = 0.0f;
+  if( ( newwidth | newheight ) > 2 )
+    dithersum = 0.5f;
+  for( y = 0 ; y < newheight ; y++ )
+  {
+    for( x = 0 ; x < newwidth ; x++, src += bytesperpixel, dst += bytesperpixel )
+    {
+      work( dst, src, bytesperpixel, bytesperline, &dithersum );
+      src += bytesperpixel;
+    }
+    src += rowoffset;
+  }
+  return;
+}
+
+
+int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options )
+{
+  int filter, retval;
+
+  filter = options->filter;
+  retval = 1;
+  if( ( filter == IM_REDUCE_FILTER_LINEAR ) || ( filter == IM_REDUCE_FILTER_LINEAR_ALPHANORM ) )
+  {
+    if( bytesperpixel == 4 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Linear );
+    else if( bytesperpixel == 3 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Linear );
+    else if( bytesperpixel == 2 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox2Linear );
+    else if( bytesperpixel == 1 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox1Linear );
+    else
+      retval = 0;
+  }
+  else if( ( filter == IM_REDUCE_FILTER_SRGB ) || ( filter == IM_REDUCE_FILTER_SRGB_ALPHANORM ) )
+  {
+    if( bytesperpixel == 4 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4sRGB );
+    else if( bytesperpixel == 3 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3sRGB );
+    else if( bytesperpixel == 2 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox2sRGB );
+    else if( bytesperpixel == 1 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox1sRGB );
+    else
+      retval = 0;
+  }
+  else if( ( filter == IM_REDUCE_FILTER_NORMALMAP ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_ALPHANORM ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN ) || ( filter == IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM ) )
+  {
+    if( bytesperpixel == 4 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Normal );
+    else if( bytesperpixel == 3 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Normal );
+    else
+      retval = 0;
+  }
+  else if( filter == IM_REDUCE_FILTER_WATERMAP )
+  {
+    if( bytesperpixel == 4 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Water );
+    else if( bytesperpixel == 3 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox3Water );
+    else
+      retval = 0;
+  }
+  else if( filter == IM_REDUCE_FILTER_PLANTMAP )
+  {
+    if( bytesperpixel == 4 )
+      imReduceImageHalfBoxWork( dstdata, srcdata, width, height, bytesperpixel, bytesperline, imReduceHalfBox4Plant );
+    else
+      retval = 0;
+  }
+  else
+    retval = 0;
+
+  return retval;
+}
+
+
+int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options )
+{
+  int newwidth, newheight, retvalue;
+
+  newwidth = ( ( imgsrc->format.width < 2 ) ? 1 : ( ( imgsrc->format.width + 1 ) / 2 ) );
+  newheight = ( ( imgsrc->format.height < 2 ) ? 1 : ( ( imgsrc->format.height + 1 ) / 2 ) );
+
+  imgdst->format.width = newwidth;
+  imgdst->format.height = newheight;
+  imgdst->format.type = imgsrc->format.type;
+  imgdst->format.bytesperpixel = imgsrc->format.bytesperpixel;
+  imgdst->format.bytesperline = imgdst->format.width * imgdst->format.bytesperpixel;
+  imgdst->data = malloc( imgdst->format.height * imgdst->format.bytesperline );
+
+  retvalue = imReduceImageHalfBoxData( imgdst->data, imgsrc->data, imgsrc->format.width, imgsrc->format.height, imgsrc->format.bytesperpixel, imgsrc->format.bytesperline, options );
+
+  return retvalue;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags )
+{
+  int layerindex, level, srclevel, srcwidth, srcheight, method, divisor;
+  int levelwidth, levelheight;
+  void *src, *dst;
+
+  cascade->width = width;
+  cascade->height = height;
+  cascade->layercount = layercount;
+  cascade->bytesperpixel = bytesperpixel;
+  cascade->bytesperline = bytesperline;
+  cascade->options = options;
+
+  /* No need for mipmaps */
+  if( ( cascade->width == 1 ) && ( cascade->height == 1 ) )
+    return 1;
+  if( bytesperpixel != 4 )
+    cascadeflags &= ~( IM_CASCADE_FLAGS_COLOR_BORDER_BASE | IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS );
+
+  /* Allocate all the mipmap levels */
+  if( !( layercount ) )
+    layercount = 1;
+  cascade->mipmap[0] = imagedata;
+  levelwidth = cascade->width;
+  levelheight = cascade->height;
+  for( level = 1 ; ; level++ )
+  {
+    levelwidth = ( levelwidth < 2 ) ? 1 : ( levelwidth >> 1 );
+    levelheight = ( levelheight < 2 ) ? 1 : ( levelheight >> 1 );
+    if( !( cascade->mipmap[level] = malloc( levelwidth * levelheight * layercount * bytesperpixel ) ) )
+      return 0;
+    if( ( levelwidth == 1 ) && ( levelheight == 1 ) )
+      break;
+  }
+  cascade->mipmap[level+1] = 0;
+
+  if( cascadeflags & IM_CASCADE_FLAGS_COLOR_BORDER_BASE )
+    imPropagateAlphaBorder( imagedata, width, height * layercount, bytesperpixel, bytesperline );
+
+  /* For every layer, compute all its mipmap */
+  for( layerindex = 0 ; layerindex < layercount ; layerindex++ )
+  {
+    levelwidth = cascade->width;
+    levelheight = cascade->height;
+    for( level = 1 ; cascade->mipmap[level] ; level++ )
+    {
+      levelwidth = ( levelwidth < 2 ) ? 1 : ( levelwidth >> 1 );
+      levelheight = ( levelheight < 2 ) ? 1 : ( levelheight >> 1 );
+      dst = ADDRESS( cascade->mipmap[level], layerindex * levelwidth * levelheight * bytesperpixel );
+
+      /* Decide what method and source level to pick */
+      if( ( levelwidth | levelheight ) >= 16 )
+      {
+        srclevel = level - 2;
+        if( srclevel < 0 )
+          srclevel = 0;
+        method = 1;
+      }
+      else
+      {
+        srclevel = level - 1;
+        method = 0;
+      }
+#if DEBUG_VERBOSE
+      printf( "Tex level %d, srclevel %d, layer %d, filter %d, method %d : %d x %d\n", level, srclevel, layerindex, options->filter, method, levelwidth, levelheight );
+#endif
+      srcwidth = width >> srclevel;
+      if( !( srcwidth ) )
+        srcwidth = 1;
+      srcheight = height >> srclevel;
+      if( !( srcheight ) )
+        srcheight = 1;
+      if( srclevel )
+        src = ADDRESS( cascade->mipmap[srclevel], layerindex * srcheight * srcwidth * bytesperpixel );
+      else
+        src = ADDRESS( cascade->mipmap[srclevel], layerindex * srcheight * cascade->bytesperline );
+
+      divisor = 1 << ( level - srclevel );
+      if( ( ( levelwidth * divisor ) != srcwidth ) || ( ( levelheight * divisor ) != srcheight ) )
+        method = 2;
+
+      if( method == 2 )
+      {
+        if( !( imReduceImageKaiserData( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, levelwidth, levelheight, options ) ) )
+        {
+          printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ );
+          return 0;
+        }
+      }
+      else if( method == 1 )
+      {
+        if( !( imReduceImageKaiserDataDivisor( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, divisor, options ) ) )
+        {
+          printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ );
+          return 0;
+        }
+      }
+      else
+      {
+        if( !( imReduceImageHalfBoxData( dst, src, srcwidth, srcheight, bytesperpixel, srcwidth * bytesperpixel, options ) ) )
+        {
+          printf( "ERROR AT %s:%d\n", __FILE__, __LINE__ );
+          return 0;
+        }
+      }
+
+      if( cascadeflags & IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS )
+        imPropagateAlphaBorder( dst, levelwidth, levelheight, bytesperpixel, levelwidth * bytesperpixel );
+    }
+  }
+
+  return 1;
+}
+
+
+void imFreeMipmapCascade( imMipmapCascade *cascade )
+{
+  int level;
+  for( level = 1 ; ; level++ )
+  {
+    if( !( cascade->mipmap[level] ) )
+      break;
+    free( cascade->mipmap[level] );
+    cascade->mipmap[level] = 0;
+  }
+  return;
+}
+
+
+////
+
+
+#define IM_PIXEL_ALPHA_MASK (0xff000000)
+#define IM_PIXEL_RGB_MASK (0x00ffffff)
+
+void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline )
+{
+  int x, y, backtrackflag;
+  uint32_t pixel, refcolor, prevrowpixel;
+  uint32_t *row, *prevrow;
+
+  if( bytesperpixel != 4 )
+    return;
+  row = (uint32_t *)imagedata;
+  prevrow = row;
+  for( y = 0 ; y < height ; y++ )
+  {
+    refcolor = 0;
+    backtrackflag = 0;
+    for( x = 0 ; x < width ; x++ )
+    {
+      pixel = row[x];
+      prevrowpixel = prevrow[x];
+      if( pixel & IM_PIXEL_ALPHA_MASK )
+      {
+        /* Pixel has some color, spread to neighbor if applicable */
+        refcolor = pixel & IM_PIXEL_RGB_MASK;
+        if( backtrackflag )
+        {
+          row[x-1] = refcolor;
+          backtrackflag = 0;
+        }
+        if( !( prevrowpixel & IM_PIXEL_ALPHA_MASK ) )
+          prevrow[x] = refcolor;
+      }
+      else
+      {
+        /* Pixel is fully transparent, spread from neighbor if applicable */
+        if( refcolor )
+        {
+          row[x] = refcolor;
+          backtrackflag = 0;
+          refcolor = 0;
+        }
+        else if( prevrowpixel & IM_PIXEL_ALPHA_MASK )
+        {
+          row[x] = prevrowpixel & IM_PIXEL_RGB_MASK;
+          backtrackflag = 0;
+        }
+        else
+          backtrackflag = 1;
+      }
+    }
+    prevrow = row;
+    row = ADDRESS( row, bytesperline );
+  }
+
+  return;
+}
+
+
+
diff --git a/lib/graphics_utils/mipmap/imgresize.h b/lib/graphics_utils/mipmap/imgresize.h
new file mode 100644
index 000000000..b47d07a6b
--- /dev/null
+++ b/lib/graphics_utils/mipmap/imgresize.h
@@ -0,0 +1,150 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2014-2017 Alexis Naveros.
+ * Portions developed under contract to the SURVICE Engineering Company.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * -----------------------------------------------------------------------------
+ */
+
+
+#ifndef IMGRESIZE_H
+#define IMGRESIZE_H
+
+
+typedef struct
+{
+  /* Specify filter type, from the IM_REDUCE_FILTER_* list */
+  int filter;
+  /* High quality, a little slow: hopcount=3; */
+  /* Good quality, much faster: hopcount=2; */
+  int hopcount;
+  /* Strong preservation/amplification of details: alpha=2.0f; */
+  /* Mild preservation/amplification of details: alpha=6.0f; */
+  float alpha;
+  /* NORMALMAP filters: factor to amyplify normals on X and Y before normalization */
+  float amplifynormal;
+  /* NORMALMAP_SUSTAIN filters: Preserve a factor of deviation "energy" as calculated by sqrtf(x*x+y*y) */
+  float normalsustainfactor;
+} imReduceOptions;
+
+static inline void imReduceSetOptions( imReduceOptions *options, int filter, int hopcount, float alpha, float amplifynormal, float normalsustainfactor )
+{
+  options->filter = filter;
+  options->hopcount = hopcount;
+  options->alpha = alpha;
+  options->amplifynormal = amplifynormal;
+  options->normalsustainfactor = normalsustainfactor;
+  return;
+}
+
+
+/* Reduce the image's dimensions by an integer divisor ~ this is fairly fast */
+int imReduceImageKaiserDataDivisor( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int sizedivisor, imReduceOptions *options );
+/* Same as imReduceImageKaiserDataDivisor(), but imgdst is allocated */
+int imReduceImageKaiserDivisor( imgImage *imgdst, imgImage *imgsrc, int sizedivisor, imReduceOptions *options );
+
+
+/* Reduce the image's dimensions to match the newwidth and newheight ~ this is a little slower */
+int imReduceImageKaiserData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, int newwidth, int newheight, imReduceOptions *options );
+/* Same as imReduceImageKaiserData(), but imgdst is allocated */
+int imReduceImageKaiser( imgImage *imgdst, imgImage *imgsrc, int newwidth, int newheight, imReduceOptions *options );
+
+
+/* Resize by half with a dumb box filter ~ don't use that except for the smallest mipmaps */
+/* Filters with ALPHANORM and/or SUSTAIN keywords are processed as the regular base filter only */
+int imReduceImageHalfBoxData( unsigned char *dstdata, unsigned char *srcdata, int width, int height, int bytesperpixel, int bytesperline, imReduceOptions *options );
+int imReduceImageHalfBox( imgImage *imgdst, imgImage *imgsrc, imReduceOptions *options );
+
+
+/*
+Keywords for image reduction filters
+
+LINEAR: Data is linear, note that this is *not* the format of typical diffuse textures
+SRGB: Color is in sRGB space, any alpha is presumed linear
+NORMALMAP: RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear
+
+ALPHANORM: Alpha normalization, the weight of pixels is proportional to their alpha values
+           (do you have "black" fully transparent pixels? please use an ALPHANORM filter)
+SUSTAIN: The "energy" of the normal map is sustained, amplified to preserve the level of details
+         Note that this filter is rather slow (set options->normalsustainfactor to 0.75 or so)
+*/
+
+enum
+{
+  /* Linear space */
+  IM_REDUCE_FILTER_LINEAR,
+  IM_REDUCE_FILTER_LINEAR_ALPHANORM,
+
+  /* sRGB space (probably what you want for diffuse textures) */
+  IM_REDUCE_FILTER_SRGB,
+  IM_REDUCE_FILTER_SRGB_ALPHANORM,
+
+  /* RGB represents a XYZ vector as (2.0*RGB)-1.0f, any alpha is presumed linear */
+  IM_REDUCE_FILTER_NORMALMAP,
+  IM_REDUCE_FILTER_NORMALMAP_ALPHANORM,
+  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN,
+  IM_REDUCE_FILTER_NORMALMAP_SUSTAIN_ALPHANORM,
+
+  /* Custom specialized filters */
+  IM_REDUCE_FILTER_WATERMAP,
+  IM_REDUCE_FILTER_PLANTMAP,
+  IM_REDUCE_FILTER_FOLLIAGE,
+  IM_REDUCE_FILTER_SKY,
+  IM_REDUCE_FILTER_FOG
+};
+
+
+////
+
+
+#define IM_MIPMAP_CASCADE_MAX (16)
+
+typedef struct
+{
+  int width;
+  int height;
+  int layercount;
+  int bytesperpixel;
+  int bytesperline;
+  imReduceOptions *options;
+  void *mipmap[IM_MIPMAP_CASCADE_MAX];
+} imMipmapCascade;
+
+
+int imBuildMipmapCascade( imMipmapCascade *cascade, void *imagedata, int width, int height, int layercount, int bytesperpixel, int bytesperline, imReduceOptions *options, int cascadeflags );
+
+void imFreeMipmapCascade( imMipmapCascade *cascade );
+
+/* For base texture, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
+#define IM_CASCADE_FLAGS_COLOR_BORDER_BASE (0x1)
+/* For generated mipmaps, propagate RGB channels to neighbors if they are fully transparent (ignored if bytesperpixel != 4 ) */
+#define IM_CASCADE_FLAGS_COLOR_BORDER_MIPMAPS (0x2)
+
+
+////
+
+
+void imPropagateAlphaBorder( unsigned char *imagedata, int width, int height, int bytesperpixel, int bytesperline );
+
+
+////
+
+
+#endif
+
diff --git a/src/config/user_config.hpp b/src/config/user_config.hpp
index 6bdfa1ae8..3efb659ee 100644
--- a/src/config/user_config.hpp
+++ b/src/config/user_config.hpp
@@ -934,6 +934,11 @@ namespace UserConfigParams
             PARAM_DEFAULT( BoolUserConfigParam(false, "everything_unlocked",
                                "Enable all karts and tracks") );
 
+    PARAM_PREFIX BoolUserConfigParam        m_hq_mipmap
+            PARAM_DEFAULT( BoolUserConfigParam(false, "hq_mipmap",
+                               "Generate mipmap for textures using "
+                               "high quality method with SSE") );
+
     // TODO? implement blacklist for new irrlicht device and GUI
     PARAM_PREFIX std::vector<std::string>   m_blacklist_res;
 
diff --git a/src/graphics/central_settings.cpp b/src/graphics/central_settings.cpp
index 9ad4e7495..d34b3e102 100644
--- a/src/graphics/central_settings.cpp
+++ b/src/graphics/central_settings.cpp
@@ -490,7 +490,7 @@ bool CentralVideoSettings::isARBPixelBufferObjectUsable() const
 
 bool CentralVideoSettings::supportsThreadedTextureLoading() const
 {
-    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable();
+    return isARBPixelBufferObjectUsable() && isARBBufferStorageUsable() && isARBTextureStorageUsable();
 }
 
 #endif   // !SERVER_ONLY
diff --git a/src/graphics/hq_mipmap_generator.cpp b/src/graphics/hq_mipmap_generator.cpp
new file mode 100644
index 000000000..4939f0ed5
--- /dev/null
+++ b/src/graphics/hq_mipmap_generator.cpp
@@ -0,0 +1,119 @@
+//  SuperTuxKart - a fun racing game with go-kart
+//  Copyright (C) 2017 SuperTuxKart-Team
+//
+//  This program is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU General Public License
+//  as published by the Free Software Foundation; either version 3
+//  of the License, or (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+
+#include "graphics/hq_mipmap_generator.hpp"
+#define DUMP_MIPMAP
+#ifdef DUMP_MIPMAP
+#include "graphics/irr_driver.hpp"
+#include "utils/string_utils.hpp"
+#endif
+#include <cassert>
+
+extern "C"
+{
+    #include <mipmap/img.h>
+    #include <mipmap/imgresize.h>
+}
+
+// ----------------------------------------------------------------------------
+HQMipmapGenerator::HQMipmapGenerator(const io::path& name, uint8_t* data,
+                                     const core::dimension2d<u32>& size,
+                                     GLuint texture_name, bool single_channel)
+                 : video::ITexture(name), m_orig_data(data), m_size(size),
+                   m_texture_name(texture_name), m_texture_size(0),
+                   m_single_channel(single_channel), m_mipmap_data(NULL)
+{
+    unsigned width = m_size.Width;
+    unsigned height = m_size.Height;
+    while (true)
+    {
+        width = width < 2 ? 1 : width >> 1;
+        height = height < 2 ? 1 : height >> 1;
+        m_mipmap_sizes.emplace_back(core::dimension2du(width, height),
+            m_texture_size);
+        m_texture_size += width * height * (m_single_channel ? 1 : 4);
+        if (width == 1 && height == 1)
+            break;
+    }
+    m_texture_size = unsigned(m_mipmap_sizes.back().second) +
+        (m_single_channel ? 1 : 4);
+    m_mipmap_data = malloc(sizeof(imMipmapCascade));
+}   // HQMipmapGenerator
+
+// ----------------------------------------------------------------------------
+HQMipmapGenerator::~HQMipmapGenerator()
+{
+    imFreeMipmapCascade((imMipmapCascade*)m_mipmap_data);
+    free(m_mipmap_data);
+}   // ~HQMipmapGenerator
+
+// ----------------------------------------------------------------------------
+void HQMipmapGenerator::threadedReload(void* ptr, void* param) const
+{
+    imReduceOptions options;
+    imReduceSetOptions(&options, IM_REDUCE_FILTER_SRGB, 3, 2.0f, 0.0f, 0.0f);
+    imMipmapCascade* mm_cascade = (imMipmapCascade*)m_mipmap_data;
+#ifdef DEBUG
+    int ret = imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
+        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
+        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
+    assert(ret == 1);
+#else
+    imBuildMipmapCascade(mm_cascade, m_orig_data, m_size.Width,
+        m_size.Height, 1/*layercount*/, m_single_channel ? 1 : 4,
+        m_single_channel ? m_size.Width : m_size.Width * 4, &options, 0);
+#endif
+    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
+    {
+        memcpy((uint8_t*)ptr + m_mipmap_sizes[i].second,
+            mm_cascade->mipmap[i + 1],
+            m_mipmap_sizes[i].first.getArea() * (m_single_channel ? 1 : 4));
+#ifdef DUMP_MIPMAP
+        if (m_single_channel) continue;
+        video::IImage* image = irr_driver->getVideoDriver()
+            ->createImageFromData(video::ECF_A8R8G8B8, m_mipmap_sizes[i].first,
+            mm_cascade->mipmap[i + 1], false/*ownForeignMemory*/);
+        irr_driver->getVideoDriver()->writeImageToFile(image, std::string
+            (StringUtils::toString(i) + "_" +
+            StringUtils::getBasename(NamedPath.getPtr()) + ".png").c_str());
+        image->drop();
+#endif
+    }
+}   // threadedReload
+
+// ----------------------------------------------------------------------------
+void HQMipmapGenerator::threadedSubImage(void* ptr) const
+{
+#if !(defined(SERVER_ONLY) || defined(USE_GLES2))
+    glBindTexture(GL_TEXTURE_2D, m_texture_name);
+    for (unsigned int i = 0; i < m_mipmap_sizes.size(); i++)
+    {
+        glTexSubImage2D(GL_TEXTURE_2D, i + 1, 0, 0,
+            m_mipmap_sizes[i].first.Width, m_mipmap_sizes[i].first.Height,
+            m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE,
+            (uint8_t*)ptr + m_mipmap_sizes[i].second);
+    }
+    delete this;
+#endif
+}   // threadedSubImage
+
+// ----------------------------------------------------------------------------
+void HQMipmapGenerator::cleanThreadedLoader()
+{
+    delete[] m_orig_data;
+}   // cleanThreadedLoader
diff --git a/src/graphics/hq_mipmap_generator.hpp b/src/graphics/hq_mipmap_generator.hpp
new file mode 100644
index 000000000..40827525d
--- /dev/null
+++ b/src/graphics/hq_mipmap_generator.hpp
@@ -0,0 +1,98 @@
+//  SuperTuxKart - a fun racing game with go-kart
+//  Copyright (C) 2017 SuperTuxKart-Team
+//
+//  This program is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU General Public License
+//  as published by the Free Software Foundation; either version 3
+//  of the License, or (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+#ifndef HEADER_HQ_MIPMAP_GENERATOR_HPP
+#define HEADER_HQ_MIPMAP_GENERATOR_HPP
+
+#include "graphics/gl_headers.hpp"
+#include "utils/no_copy.hpp"
+#include "utils/types.hpp"
+
+#include <vector>
+#include <ITexture.h>
+
+using namespace irr;
+
+class HQMipmapGenerator : public video::ITexture, NoCopy
+{
+private:
+    uint8_t* m_orig_data;
+
+    core::dimension2d<u32> m_size;
+
+    GLuint m_texture_name;
+
+    unsigned int m_texture_size;
+
+    bool m_single_channel;
+
+    void* m_mipmap_data;
+
+    std::vector<std::pair<core::dimension2d<u32>, size_t> > m_mipmap_sizes;
+
+public:
+    // ------------------------------------------------------------------------
+    HQMipmapGenerator(const io::path& name, uint8_t* data,
+                      const core::dimension2d<u32>& size, GLuint texture_name,
+                      bool single_channel);
+    // ------------------------------------------------------------------------
+    virtual ~HQMipmapGenerator();
+    // ------------------------------------------------------------------------
+    virtual void* lock(video::E_TEXTURE_LOCK_MODE mode =
+                       video::ETLM_READ_WRITE, u32 mipmap_level = 0)
+                                                               { return NULL; }
+    // ------------------------------------------------------------------------
+    virtual void unlock()                                                    {}
+    // ------------------------------------------------------------------------
+    virtual const core::dimension2d<u32>& getOriginalSize() const
+                                                             { return m_size; }
+    // ------------------------------------------------------------------------
+    virtual const core::dimension2d<u32>& getSize() const    { return m_size; }
+    // ------------------------------------------------------------------------
+    virtual video::E_DRIVER_TYPE getDriverType() const
+    {
+#if defined(USE_GLES2)
+        return video::EDT_OGLES2;
+#else
+        return video::EDT_OPENGL;
+#endif
+    }
+    // ------------------------------------------------------------------------
+    virtual video::ECOLOR_FORMAT getColorFormat() const
+                                                { return video::ECF_A8R8G8B8; }
+    // ------------------------------------------------------------------------
+    virtual u32 getPitch() const                                  { return 0; }
+    // ------------------------------------------------------------------------
+    virtual bool hasMipMaps() const                           { return false; }
+    // ------------------------------------------------------------------------
+    virtual void regenerateMipMapLevels(void* mipmap_data = NULL)            {}
+    // ------------------------------------------------------------------------
+    virtual u32 getOpenGLTextureName() const         { return m_texture_name; }
+    // ------------------------------------------------------------------------
+    virtual u64 getHandle()                                       { return 0; }
+    // ------------------------------------------------------------------------
+    virtual unsigned int getTextureSize() const      { return m_texture_size; }
+    // ------------------------------------------------------------------------
+    virtual void threadedReload(void* ptr, void* param) const;
+    // ------------------------------------------------------------------------
+    virtual void threadedSubImage(void* ptr) const;
+    // ------------------------------------------------------------------------
+    virtual void cleanThreadedLoader();
+
+};   // HQMipmapGenerator
+
+#endif
diff --git a/src/graphics/stk_tex_manager.cpp b/src/graphics/stk_tex_manager.cpp
index e524c3b4b..02c4fef52 100644
--- a/src/graphics/stk_tex_manager.cpp
+++ b/src/graphics/stk_tex_manager.cpp
@@ -17,6 +17,7 @@
 
 #include "graphics/stk_tex_manager.hpp"
 #include "config/hardware_stats.hpp"
+#include "config/user_config.hpp"
 #include "graphics/central_settings.hpp"
 #include "graphics/materials.hpp"
 #include "graphics/threaded_tex_loader.hpp"
@@ -33,11 +34,12 @@ STKTexManager::STKTexManager() : m_pbo(0), m_thread_size(0)
 #if !(defined(SERVER_ONLY) || defined(USE_GLES2))
     if (CVS->supportsThreadedTextureLoading())
     {
+        UserConfigParams::m_hq_mipmap = true;
         pthread_mutex_init(&m_threaded_load_textures_mutex, NULL);
         pthread_cond_init(&m_cond_request, NULL);
         m_thread_size = HardwareStats::getNumProcessors();
-        m_thread_size = core::clamp(m_thread_size, 1, 3);
-        static const unsigned max_pbo_size = 48 * 1024 * 1024;
+        m_thread_size = core::clamp(m_thread_size, 1, 8);
+        static const unsigned max_pbo_size = 128 * 1024 * 1024;
         const unsigned each_capacity = max_pbo_size / m_thread_size;
         Log::info("STKTexManager", "%d thread(s) for texture loading,"
             " each capacity %d MB", m_thread_size,
diff --git a/src/graphics/stk_texture.cpp b/src/graphics/stk_texture.cpp
index 0090db81a..2cfbe413c 100644
--- a/src/graphics/stk_texture.cpp
+++ b/src/graphics/stk_texture.cpp
@@ -18,14 +18,17 @@
 #include "graphics/stk_texture.hpp"
 #include "config/user_config.hpp"
 #include "graphics/central_settings.hpp"
+#include "graphics/hq_mipmap_generator.hpp"
 #include "graphics/irr_driver.hpp"
 #include "graphics/material.hpp"
 #include "graphics/material_manager.hpp"
 #include "graphics/materials.hpp"
+#include "graphics/stk_tex_manager.hpp"
 #include "modes/profile_world.hpp"
 #include "utils/log.hpp"
 #include "utils/string_utils.hpp"
 
+#include <algorithm>
 #include <fstream>
 #include <functional>
 
@@ -215,7 +218,7 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
     const unsigned int w = m_size.Width;
     const unsigned int h = m_size.Height;
     unsigned int format = m_single_channel ? GL_RED : GL_BGRA;
-    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA;
+    unsigned int internal_format = m_single_channel ? GL_R8 : GL_RGBA8;
 
 #if !defined(USE_GLES2)
     if (m_mesh_texture && CVS->isTextureCompressionEnabled())
@@ -227,13 +230,41 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
     else
     {
         internal_format =
-            m_single_channel ? GL_R8 : m_srgb ? GL_SRGB_ALPHA : GL_RGBA;
+            m_single_channel ? GL_R8 : m_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8;
     }
 #endif
     if (!useThreadedLoading())
         formatConversion(data, &format, w, h);
 
-    if (!no_upload)
+    if (useThreadedLoading())
+    {
+        if (m_texture_name == 0)
+        {
+            glGenTextures(1, &m_texture_name);
+            glBindTexture(GL_TEXTURE_2D, m_texture_name);
+            if (m_single_channel)
+            {
+                glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ONE);
+                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_ONE);
+                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_ONE);
+                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_RED);
+            }
+            int levels = 1;
+            int width = w;
+            int height = h;
+            while (true)
+            {
+                width = width < 2 ? 1 : width >> 1;
+                height = height < 2 ? 1 : height >> 1;
+                levels++;
+                if (width == 1 && height == 1)
+                    break;
+            }
+            glTexStorage2D(GL_TEXTURE_2D, levels, internal_format, w, h);
+        }
+    }
+    else if (!no_upload)
     {
         const bool reload = m_texture_name != 0;
         if (!reload)
@@ -253,14 +284,14 @@ void STKTexture::reload(bool no_upload, uint8_t* preload_data,
             glTexImage2D(GL_TEXTURE_2D, 0, internal_format, w, h, 0, format,
                 GL_UNSIGNED_BYTE, data);
         }
-        else if (!useThreadedLoading())
+        else
         {
             glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, format,
                 GL_UNSIGNED_BYTE, data);
         }
         if (orig_img)
             orig_img->unlock();
-        if (hasMipMaps() && !useThreadedLoading())
+        if (hasMipMaps())
             glGenerateMipmap(GL_TEXTURE_2D);
     }
 
@@ -594,8 +625,15 @@ void STKTexture::threadedReload(void* ptr, void* param) const
     if (orig_img)
     {
         orig_img->unlock();
+        orig_img->setDeleteMemory(false);
         orig_img->drop();
     }
+    if (useHQMipmap())
+    {
+        HQMipmapGenerator* hqmg = new HQMipmapGenerator(NamedPath, data,
+            m_size, m_texture_name, m_single_channel);
+        ((STKTexManager*)(param))->addThreadedLoadTexture(hqmg);
+    }
     else
         delete[] data;
 }   // threadedReload
@@ -607,8 +645,11 @@ void STKTexture::threadedSubImage(void* ptr) const
     glBindTexture(GL_TEXTURE_2D, m_texture_name);
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.Width, m_size.Height,
         m_single_channel ? GL_RED : GL_BGRA, GL_UNSIGNED_BYTE, ptr);
+    if (useHQMipmap())
+        return;
     if (hasMipMaps())
         glGenerateMipmap(GL_TEXTURE_2D);
+
 #endif
 }   // threadedSubImage
 
@@ -620,3 +661,10 @@ void STKTexture::cleanThreadedLoader()
     m_file = NULL;
     m_img_loader = NULL;
 }   // cleanThreadedLoader
+
+//-----------------------------------------------------------------------------
+bool STKTexture::useHQMipmap() const
+{
+    return UserConfigParams::m_hq_mipmap && m_size.Width > 1 &&
+        m_size.Height > 1;
+}   // useHQMipmap
diff --git a/src/graphics/stk_texture.hpp b/src/graphics/stk_texture.hpp
index 7d2c03fc6..652a41e8b 100644
--- a/src/graphics/stk_texture.hpp
+++ b/src/graphics/stk_texture.hpp
@@ -76,6 +76,8 @@ private:
             sc[i] = data[4 * i + 3];
         return sc;
     }
+    // ------------------------------------------------------------------------
+    bool useHQMipmap() const;
 
 public:
     // ------------------------------------------------------------------------
diff --git a/src/graphics/threaded_tex_loader.cpp b/src/graphics/threaded_tex_loader.cpp
index 5d09d095d..ffefeb9af 100644
--- a/src/graphics/threaded_tex_loader.cpp
+++ b/src/graphics/threaded_tex_loader.cpp
@@ -84,8 +84,9 @@ void ThreadedTexLoader::handleCompletedTextures()
     size_t offset = m_pbo_offset;
     for (irr::video::ITexture* tex : m_completed_textures)
     {
+        size_t cur_offset = tex->getTextureSize();
         tex->threadedSubImage((void*)offset);
-        offset += tex->getTextureSize();
+        offset += cur_offset;
     }
     m_completed_textures.clear();
 #endif