stk-code_catmod/lib/simd_wrapper/simd_wrapper.h
2022-04-29 11:02:25 +08:00

160 lines
5.1 KiB
C

/* ==========================================================================
* Copyright (c) 2022 SuperTuxKart-Team
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to permit
* persons to whom the Software is furnished to do so, subject to the
* following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
* NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
* ==========================================================================
*/
#ifndef HEADER_SIMD_WRAPPER_HPP
#define HEADER_SIMD_WRAPPER_HPP
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <simde/simde-arch.h>
// Select the SIMD backend for the target architecture and publish one
// CPU_*_SUPPORT macro, defined as (1), for every instruction-set level that
// the rest of the code may use.
#if defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_X86)
// x86 / x86-64: use the compiler's native intrinsic headers. A level is
// enabled when the matching compiler macro is set or when the corresponding
// CPU_ENABLE_* macro evaluates to non-zero.
#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
// _M_X64 and _M_IX86_FP are tested in addition to __SSE__ / __SSE2__ for
// the SSE and SSE2 levels.
#if __SSE__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 1 ) ) || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 2 ) ) || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
// __AVX__ is accepted as an alternative trigger for SSE3, SSSE3 and SSE4.1.
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
// NOTE(review): unlike the three levels above, SSE4.2 is not enabled by
// __AVX__ alone - confirm whether that asymmetry is intentional.
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#elif defined(SIMDE_ARCH_ARM_NEON)
// ARM NEON: only the compile-time SSE*-to-NEON translation is enabled for
// now because it is easy to test. Every level up to SSE4.2 is switched on;
// from AVX onwards few intrinsics have a native conversion, so they would
// fall back to the slower C99 implementation.
#define CPU_MMX_SUPPORT (1)
#define CPU_SSE_SUPPORT (1)
#define CPU_SSE2_SUPPORT (1)
#define CPU_SSE3_SUPPORT (1)
#define CPU_SSSE3_SUPPORT (1)
#define CPU_SSE4_1_SUPPORT (1)
#define CPU_SSE4_2_SUPPORT (1)
#if defined(_MSC_VER) && defined(__cplusplus)
// Work around math-related functions that are otherwise missing with MSVC.
#include <cmath>
#endif
// Must be defined before the SIMDe header is included so that SIMDe exposes
// its implementations under the native intrinsic names.
#define SIMDE_ENABLE_NATIVE_ALIASES
#include "simde/x86/sse4.2.h"
#endif
// Fallbacks for rounding-related names: when none of the headers included
// so far defined one of them, alias it to the SIMDe equivalent.
#ifndef _MM_FROUND_TO_NEG_INF
#define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
#endif
#ifndef _MM_FROUND_NO_EXC
#define _MM_FROUND_NO_EXC SIMDE_MM_FROUND_NO_EXC
#endif
#ifndef _MM_SET_ROUNDING_MODE
// Object-like alias, so _MM_SET_ROUNDING_MODE(mode) becomes a call to
// SIMDE_MM_SET_ROUNDING_MODE(mode).
#define _MM_SET_ROUNDING_MODE SIMDE_MM_SET_ROUNDING_MODE
#endif
#ifndef _MM_ROUND_NEAREST
#define _MM_ROUND_NEAREST SIMDE_MM_ROUND_NEAREST
#endif
#ifndef _MM_ROUND_UP
#define _MM_ROUND_UP SIMDE_MM_ROUND_UP
#endif
#ifndef _MM_ROUND_DOWN
#define _MM_ROUND_DOWN SIMDE_MM_ROUND_DOWN
#endif
// Utilities for aligned allocation
/**
 * Allocates a block of at least `bytes` bytes whose start address is a
 * multiple of `alignment`.
 *
 * The block is carved out of a larger malloc() allocation. The address
 * returned by malloc() is stored, as a size_t, in the sizeof(size_t) bytes
 * directly in front of the returned pointer so that simd_aligned_free() can
 * hand it back to free().
 *
 * @param alignment Required alignment in bytes. Any non-zero value works,
 *                  it does not have to be a power of two.
 * @param bytes     Number of usable bytes requested.
 * @return The aligned pointer, or NULL when `alignment` is 0, when the
 *         padded size does not fit in a size_t, or when malloc() fails.
 *         Release the block with simd_aligned_free() only, never free().
 */
static inline void* simd_aligned_alloc(size_t alignment, size_t bytes)
{
    const size_t header = sizeof(size_t);
    const size_t max_size = (size_t)-1;

    // A zero alignment would divide by zero in the modulo below.
    if (alignment == 0)
        return NULL;

    // Padding is the book-keeping header plus 2 * alignment. The header
    // plus one alignment is already sufficient (the forward shift below is
    // between 1 and alignment bytes); the second alignment is extra slack.
    // Refuse requests whose padded size would wrap around.
    if (alignment > (max_size - header) / 2)
        return NULL;
    const size_t padding = (2 * alignment) + header;
    if (bytes > max_size - padding)
        return NULL;

    char* const data_start = (char*)malloc(bytes + padding);
    if (!data_start)
        return NULL;

    // Reserve room for the book-keeping, then move forward to the next
    // address that is a multiple of alignment.
    char* data = data_start + header;
    data += alignment - (((size_t)data) % alignment);

    // Record where the malloc'd block starts. The slot is only as aligned
    // as the caller asked for, so copy bytes instead of storing through a
    // size_t pointer.
    const size_t start_address = (size_t)data_start;
    memcpy(data - header, &start_address, header);
    return data;
}
/**
 * Releases a block obtained from simd_aligned_alloc().
 *
 * The sizeof(size_t) bytes directly in front of `raw_data` are expected to
 * hold the address originally returned by malloc(); that address is what
 * gets passed to free().
 *
 * @param raw_data Pointer returned by simd_aligned_alloc(), or NULL (in
 *                 which case nothing happens). Passing any other pointer is
 *                 invalid because the book-keeping in front of it is read
 *                 unconditionally.
 */
static inline void simd_aligned_free(void* raw_data)
{
    if (!raw_data)
        return;

    // The book-keeping slot is not guaranteed to be aligned for size_t
    // (that depends on the alignment requested at allocation time), so copy
    // it out byte-wise rather than dereferencing a size_t pointer.
    size_t start_address;
    memcpy(&start_address, (const char*)raw_data - sizeof(size_t),
        sizeof(start_address));

    // Give the whole underlying allocation back to the C runtime.
    free((void*)start_address);
}
#endif