Add simde

Benau 2022-04-29 11:02:25 +08:00
parent 0f2b3da37e
commit 383bd93261
350 changed files with 187154 additions and 0 deletions

View File

@@ -0,0 +1,159 @@
/* ==========================================================================
* Copyright (c) 2022 SuperTuxKart-Team
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to permit
* persons to whom the Software is furnished to do so, subject to the
* following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
* NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
* ==========================================================================
*/
#ifndef HEADER_SIMD_WRAPPER_HPP
#define HEADER_SIMD_WRAPPER_HPP
#include <simde/simde-arch.h>
#if defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_X86)
// Native SSE
#if __MMX__ || CPU_ENABLE_MMX
#include <mmintrin.h>
#define CPU_MMX_SUPPORT (1)
#endif
#if __SSE__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 1 ) ) || CPU_ENABLE_SSE
#include <xmmintrin.h>
#define CPU_SSE_SUPPORT (1)
#endif
#if __SSE2__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 2 ) ) || CPU_ENABLE_SSE2
#include <emmintrin.h>
#define CPU_SSE2_SUPPORT (1)
#endif
#if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
#include <pmmintrin.h>
#define CPU_SSE3_SUPPORT (1)
#endif
#if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
#include <tmmintrin.h>
#define CPU_SSSE3_SUPPORT (1)
#endif
#if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
#include <smmintrin.h>
#define CPU_SSE4_1_SUPPORT (1)
#endif
#if __SSE4_2__ || CPU_ENABLE_SSE4_2
#include <nmmintrin.h>
#define CPU_SSE4_2_SUPPORT (1)
#endif
#elif defined(SIMDE_ARCH_ARM_NEON)
// We only enable compile-time SSE*-to-NEON translation for now because it's
// easy to test. We enable up to SSE4.2 because beyond that (starting from
// AVX) SIMDe has few native conversions and falls back to the slower
// portable C99 implementations.
#define CPU_MMX_SUPPORT (1)
#define CPU_SSE_SUPPORT (1)
#define CPU_SSE2_SUPPORT (1)
#define CPU_SSE3_SUPPORT (1)
#define CPU_SSSE3_SUPPORT (1)
#define CPU_SSE4_1_SUPPORT (1)
#define CPU_SSE4_2_SUPPORT (1)
#if defined(_MSC_VER) && defined(__cplusplus)
// Fix math-related functions missing in MSVC
#include <cmath>
#endif
#define SIMDE_ENABLE_NATIVE_ALIASES
#include "simde/x86/sse4.2.h"
#endif
#ifndef _MM_FROUND_TO_NEG_INF
#define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
#endif
#ifndef _MM_FROUND_NO_EXC
#define _MM_FROUND_NO_EXC SIMDE_MM_FROUND_NO_EXC
#endif
#ifndef _MM_SET_ROUNDING_MODE
#define _MM_SET_ROUNDING_MODE SIMDE_MM_SET_ROUNDING_MODE
#endif
#ifndef _MM_ROUND_NEAREST
#define _MM_ROUND_NEAREST SIMDE_MM_ROUND_NEAREST
#endif
#ifndef _MM_ROUND_UP
#define _MM_ROUND_UP SIMDE_MM_ROUND_UP
#endif
#ifndef _MM_ROUND_DOWN
#define _MM_ROUND_DOWN SIMDE_MM_ROUND_DOWN
#endif
// Utilities for aligned allocation
inline void* simd_aligned_alloc(size_t alignment, size_t bytes)
{
// we need to allocate enough storage for the requested bytes, some
// book-keeping (to store the location returned by malloc) and some extra
// padding to allow us to find an aligned byte. the alignment offset below
// is at most 'alignment' bytes, so 2 * alignment is safely enough.
const size_t total_size = bytes + (2 * alignment) + sizeof(size_t);
// use malloc to allocate the memory.
char* data = (char*)malloc(total_size);
if (data)
{
// store the original start of the malloc'd data.
const void* const data_start = data;
// dedicate enough space to the book-keeping.
data += sizeof(size_t);
// find a memory location with correct alignment. the alignment minus
// the remainder of this mod operation is how many bytes forward we need
// to move to find an aligned byte.
const size_t offset = alignment - (((size_t)data) % alignment);
// set data to the aligned memory.
data += offset;
// write the book-keeping.
size_t* book_keeping = (size_t*)(data - sizeof(size_t));
*book_keeping = (size_t)data_start;
}
return data;
}
inline void simd_aligned_free(void* raw_data)
{
if (raw_data)
{
char* data = (char*)raw_data;
// we have to assume this memory was allocated with simd_aligned_alloc.
// this means the sizeof(size_t) bytes before data are the book-keeping
// which points to the location we need to pass to free.
data -= sizeof(size_t);
// set data to the location stored in book-keeping.
data = (char*)(*((size_t*)data));
// free the memory.
free(data);
}
}
#endif
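
A minimal usage sketch (an editor's addition, not part of the commit; it assumes the header above is saved as "simd_wrapper.hpp" and that SIMDe is on the include path). The same SSE2 code then compiles natively on x86 and, through SIMDe's native aliases, on ARM NEON, and the aligned-allocation helpers pair like malloc/free:

#include "simd_wrapper.hpp"
#include <cstdio>

int main()
{
#ifdef CPU_SSE2_SUPPORT
    // 16-byte alignment as required by _mm_load_ps / _mm_store_ps.
    float* buf = (float*)simd_aligned_alloc(16, 8 * sizeof(float));
    for (int i = 0; i < 8; i++)
        buf[i] = (float)i;
    __m128 a = _mm_load_ps(buf);      // {0, 1, 2, 3}
    __m128 b = _mm_load_ps(buf + 4);  // {4, 5, 6, 7}
    _mm_store_ps(buf, _mm_add_ps(a, b));
    std::printf("%f %f %f %f\n", buf[0], buf[1], buf[2], buf[3]); // 4 6 8 10
    simd_aligned_free(buf);
#endif
    return 0;
}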

View File

@@ -0,0 +1,20 @@
Copyright (c) 2017 Evan Nemerson <evan@nemerson.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,10 @@
# SIMDe Without Test Cases
This repository contains only the core of
[SIMDe](https://github.com/simd-everywhere/simde).
It is generated automatically for every commit to master, and is
intended to be used as a submodule in projects which don't want to
include the (rather large) test cases.
All development work happens in the main repository; please do not
file issues or create pull requests against this repository.

View File

@@ -0,0 +1,210 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
*/
#if !defined(SIMDE_ARM_NEON_H)
#define SIMDE_ARM_NEON_H
#include "neon/types.h"
#include "neon/aba.h"
#include "neon/abd.h"
#include "neon/abdl.h"
#include "neon/abs.h"
#include "neon/add.h"
#include "neon/addhn.h"
#include "neon/addl.h"
#include "neon/addlv.h"
#include "neon/addl_high.h"
#include "neon/addv.h"
#include "neon/addw.h"
#include "neon/addw_high.h"
#include "neon/and.h"
#include "neon/bcax.h"
#include "neon/bic.h"
#include "neon/bsl.h"
#include "neon/cage.h"
#include "neon/cagt.h"
#include "neon/ceq.h"
#include "neon/ceqz.h"
#include "neon/cge.h"
#include "neon/cgez.h"
#include "neon/cgt.h"
#include "neon/cgtz.h"
#include "neon/cle.h"
#include "neon/clez.h"
#include "neon/cls.h"
#include "neon/clt.h"
#include "neon/cltz.h"
#include "neon/clz.h"
#include "neon/cmla.h"
#include "neon/cmla_rot90.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/combine.h"
#include "neon/create.h"
#include "neon/dot.h"
#include "neon/dot_lane.h"
#include "neon/dup_lane.h"
#include "neon/dup_n.h"
#include "neon/eor.h"
#include "neon/ext.h"
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
#include "neon/hadd.h"
#include "neon/hsub.h"
#include "neon/ld1.h"
#include "neon/ld1_dup.h"
#include "neon/ld1_lane.h"
#include "neon/ld2.h"
#include "neon/ld3.h"
#include "neon/ld4.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
#include "neon/maxv.h"
#include "neon/min.h"
#include "neon/minnm.h"
#include "neon/minv.h"
#include "neon/mla.h"
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
#include "neon/movn_high.h"
#include "neon/mul.h"
#include "neon/mul_lane.h"
#include "neon/mul_n.h"
#include "neon/mull.h"
#include "neon/mull_high.h"
#include "neon/mull_lane.h"
#include "neon/mull_n.h"
#include "neon/mvn.h"
#include "neon/neg.h"
#include "neon/orn.h"
#include "neon/orr.h"
#include "neon/padal.h"
#include "neon/padd.h"
#include "neon/paddl.h"
#include "neon/pmax.h"
#include "neon/pmin.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
#include "neon/qrshrn_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
#include "neon/qshlu_n.h"
#include "neon/qshrn_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
#include "neon/reinterpret.h"
#include "neon/rev16.h"
#include "neon/rev32.h"
#include "neon/rev64.h"
#include "neon/rhadd.h"
#include "neon/rnd.h"
#include "neon/rndm.h"
#include "neon/rndi.h"
#include "neon/rndn.h"
#include "neon/rndp.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
#include "neon/rshrn_n.h"
#include "neon/rsqrte.h"
#include "neon/rsqrts.h"
#include "neon/rsra_n.h"
#include "neon/set_lane.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
#include "neon/shrn_n.h"
#include "neon/sqadd.h"
#include "neon/sra_n.h"
#include "neon/sri_n.h"
#include "neon/st1.h"
#include "neon/st1_lane.h"
#include "neon/st2.h"
#include "neon/st2_lane.h"
#include "neon/st3.h"
#include "neon/st3_lane.h"
#include "neon/st4.h"
#include "neon/st4_lane.h"
#include "neon/sub.h"
#include "neon/subhn.h"
#include "neon/subl.h"
#include "neon/subl_high.h"
#include "neon/subw.h"
#include "neon/subw_high.h"
#include "neon/tbl.h"
#include "neon/tbx.h"
#include "neon/trn.h"
#include "neon/trn1.h"
#include "neon/trn2.h"
#include "neon/tst.h"
#include "neon/uqadd.h"
#include "neon/uzp.h"
#include "neon/uzp1.h"
#include "neon/uzp2.h"
#include "neon/xar.h"
#include "neon/zip.h"
#include "neon/zip1.h"
#include "neon/zip2.h"
#endif /* SIMDE_ARM_NEON_H */
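
A hedged sketch of what this umbrella header provides (an editor's addition; it assumes <simde/arm/neon.h> is on the include path): NEON code written once against the simde_-prefixed API compiles on any supported target, and on real ARM hardware the calls lower to the native intrinsics:

#include <simde/arm/neon.h>
#include <cstdio>

int main()
{
    int8_t in[8] = { -1, 2, -3, 4, -5, 6, -7, 8 };
    simde_int8x8_t v = simde_vld1_s8(in);  // load 8 lanes
    simde_int8x8_t r = simde_vabs_s8(v);   // lane-wise absolute value
    int8_t out[8];
    simde_vst1_s8(out, r);                 // store back
    for (int i = 0; i < 8; i++)
        std::printf("%d ", out[i]);        // 1 2 3 4 5 6 7 8
    return 0;
}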

View File

@@ -0,0 +1,208 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
*/
#if !defined(SIMDE_ARM_NEON_ABA_H)
#define SIMDE_ARM_NEON_ABA_H
#include "abd.h"
#include "add.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vaba_s8(simde_int8x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_s8(a, b, c);
#else
return simde_vadd_s8(simde_vabd_s8(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_s8
#define vaba_s8(a, b, c) simde_vaba_s8((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vaba_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_s16(a, b, c);
#else
return simde_vadd_s16(simde_vabd_s16(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_s16
#define vaba_s16(a, b, c) simde_vaba_s16((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vaba_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_s32(a, b, c);
#else
return simde_vadd_s32(simde_vabd_s32(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_s32
#define vaba_s32(a, b, c) simde_vaba_s32((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vaba_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_u8(a, b, c);
#else
return simde_vadd_u8(simde_vabd_u8(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_u8
#define vaba_u8(a, b, c) simde_vaba_u8((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vaba_u16(simde_uint16x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_u16(a, b, c);
#else
return simde_vadd_u16(simde_vabd_u16(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_u16
#define vaba_u16(a, b, c) simde_vaba_u16((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vaba_u32(simde_uint32x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaba_u32(a, b, c);
#else
return simde_vadd_u32(simde_vabd_u32(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vaba_u32
#define vaba_u32(a, b, c) simde_vaba_u32((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vabaq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_s8(a, b, c);
#else
return simde_vaddq_s8(simde_vabdq_s8(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_s8
#define vabaq_s8(a, b, c) simde_vabaq_s8((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabaq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_s16(a, b, c);
#else
return simde_vaddq_s16(simde_vabdq_s16(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_s16
#define vabaq_s16(a, b, c) simde_vabaq_s16((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabaq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_s32(a, b, c);
#else
return simde_vaddq_s32(simde_vabdq_s32(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_s32
#define vabaq_s32(a, b, c) simde_vabaq_s32((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vabaq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_u8(a, b, c);
#else
return simde_vaddq_u8(simde_vabdq_u8(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_u8
#define vabaq_u8(a, b, c) simde_vabaq_u8((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabaq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_u16(a, b, c);
#else
return simde_vaddq_u16(simde_vabdq_u16(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_u16
#define vabaq_u16(a, b, c) simde_vabaq_u16((a), (b), (c))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabaq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabaq_u32(a, b, c);
#else
return simde_vaddq_u32(simde_vabdq_u32(b, c), a);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabaq_u32
#define vabaq_u32(a, b, c) simde_vabaq_u32((a), (b), (c))
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_ARM_NEON_ABA_H) */
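
As a concrete reading of the fallback above, simde_vadd_*(simde_vabd_*(b, c), a) computes a[i] + |b[i] - c[i]| per lane; a small editor's sketch (assuming <simde/arm/neon.h> is on the include path):

#include <simde/arm/neon.h>
#include <cstdio>

int main()
{
    int8_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int8_t b[8] = { 0, 10, 0, 10, 0, 10, 0, 10 };
    int8_t c[8] = { 10, 0, 10, 0, 10, 0, 10, 0 };
    int8_t out[8];
    simde_vst1_s8(out, simde_vaba_s8(simde_vld1_s8(a),
                                     simde_vld1_s8(b),
                                     simde_vld1_s8(c)));
    for (int i = 0; i < 8; i++)
        std::printf("%d ", out[i]);  // 11 12 13 14 15 16 17 18
    return 0;
}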

View File

@@ -0,0 +1,489 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
*/
#if !defined(SIMDE_ARM_NEON_ABD_H)
#define SIMDE_ARM_NEON_ABD_H
#include "abs.h"
#include "subl.h"
#include "movn.h"
#include "movl.h"
#include "reinterpret.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vabds_f32(simde_float32_t a, simde_float32_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabds_f32(a, b);
#else
simde_float32_t r = a - b;
return r < 0 ? -r : r;
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabds_f32
#define vabds_f32(a, b) simde_vabds_f32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64_t
simde_vabdd_f64(simde_float64_t a, simde_float64_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdd_f64(a, b);
#else
simde_float64_t r = a - b;
return r < 0 ? -r : r;
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabdd_f64
#define vabdd_f64(a, b) simde_vabdd_f64((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabd_f32(simde_float32x2_t a, simde_float32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_f32(a, b);
#else
return simde_vabs_f32(simde_vsub_f32(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_f32
#define vabd_f32(a, b) simde_vabd_f32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x1_t
simde_vabd_f64(simde_float64x1_t a, simde_float64x1_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabd_f64(a, b);
#else
return simde_vabs_f64(simde_vsub_f64(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabd_f64
#define vabd_f64(a, b) simde_vabd_f64((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s8(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE)
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);
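/* Branchless |a - b|: m is all-ones (-1) in lanes where b > a, so
 * ((a - b) + m) ^ m = ~((a - b) - 1) = -(a - b) = b - a there, while
 * (x + 0) ^ 0 leaves x untouched in the remaining lanes. */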
const __m64 m = _mm_cmpgt_pi8(b_.m64, a_.m64);
r_.m64 =
_mm_xor_si64(
_mm_add_pi8(
_mm_sub_pi8(a_.m64, b_.m64),
m
),
m
);
return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b)));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_s8
#define vabd_s8(a, b) simde_vabd_s8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s16(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);
r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, b_.m64), _mm_min_pi16(a_.m64, b_.m64));
return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b)));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_s16
#define vabd_s16(a, b) simde_vabd_s16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vabd_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s32(a, b);
#else
return simde_vmovn_s64(simde_vabsq_s64(simde_vsubl_s32(a, b)));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_s32
#define vabd_s32(a, b) simde_vabd_s32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vabd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u8(a, b);
#else
return simde_vmovn_u16(
simde_vreinterpretq_u16_s16(
simde_vabsq_s16(
simde_vsubq_s16(
simde_vreinterpretq_s16_u16(simde_vmovl_u8(a)),
simde_vreinterpretq_s16_u16(simde_vmovl_u8(b))))));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_u8
#define vabd_u8(a, b) simde_vabd_u8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vabd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u16(a, b);
#else
return simde_vmovn_u32(
simde_vreinterpretq_u32_s32(
simde_vabsq_s32(
simde_vsubq_s32(
simde_vreinterpretq_s32_u32(simde_vmovl_u16(a)),
simde_vreinterpretq_s32_u32(simde_vmovl_u16(b))))));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_u16
#define vabd_u16(a, b) simde_vabd_u16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u32(a, b);
#else
return simde_vmovn_u64(
simde_vreinterpretq_u64_s64(
simde_vabsq_s64(
simde_vsubq_s64(
simde_vreinterpretq_s64_u64(simde_vmovl_u32(a)),
simde_vreinterpretq_s64_u64(simde_vmovl_u32(b))))));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabd_u32
#define vabd_u32(a, b) simde_vabd_u32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabdq_f32(simde_float32x4_t a, simde_float32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_f32(a, b);
#else
return simde_vabsq_f32(simde_vsubq_f32(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_f32
#define vabdq_f32(a, b) simde_vabdq_f32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x2_t
simde_vabdq_f64(simde_float64x2_t a, simde_float64x2_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdq_f64(a, b);
#else
return simde_vabsq_f64(simde_vsubq_f64(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabdq_f64
#define vabdq_f64(a, b) simde_vabdq_f64((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int8x16_private
r_,
a_ = simde_int8x16_to_private(a),
b_ = simde_int8x16_to_private(b);
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epi8(a_.m128i, b_.m128i), _mm_min_epi8(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
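/* Same branchless trick as the MMX path in simde_vabd_s8 above: where
 * b > a the all-ones mask m turns (a - b) into b - a via ((a - b) + m) ^ m. */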
const __m128i m = _mm_cmpgt_epi8(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi8(
_mm_sub_epi8(a_.m128i, b_.m128i),
m
),
m
);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp < 0 ? -tmp : tmp);
}
#endif
return simde_int8x16_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_s8
#define vabdq_s8(a, b) simde_vabdq_s8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int16x8_private
r_,
a_ = simde_int16x8_to_private(a),
b_ = simde_int16x8_to_private(b);
#if defined(SIMDE_X86_SSE2_NATIVE)
/* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881658604 */
r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] =
(a_.values[i] < b_.values[i]) ?
(b_.values[i] - a_.values[i]) :
(a_.values[i] - b_.values[i]);
}
#endif
return simde_int16x8_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_s16
#define vabdq_s16(a, b) simde_vabdq_s16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int32x4_private
r_,
a_ = simde_int32x4_to_private(a),
b_ = simde_int32x4_to_private(b);
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi32(
_mm_sub_epi32(a_.m128i, b_.m128i),
m
),
m
);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp < 0 ? -tmp : tmp);
}
#endif
return simde_int32x4_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_s32
#define vabdq_s32(a, b) simde_vabdq_s32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_u8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint8x16_private
r_,
a_ = simde_uint8x16_to_private(a),
b_ = simde_uint8x16_to_private(b);
#if defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp < 0 ? -tmp : tmp);
}
#endif
return simde_uint8x16_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_u8
#define vabdq_u8(a, b) simde_vabdq_u8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_u16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint16x8_private
r_,
a_ = simde_uint16x8_to_private(a),
b_ = simde_uint16x8_to_private(b);
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp < 0 ? -tmp : tmp);
}
#endif
return simde_uint16x8_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_u16
#define vabdq_u16(a, b) simde_vabdq_u16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_u32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint32x4_private
r_,
a_ = simde_uint32x4_to_private(a),
b_ = simde_uint32x4_to_private(b);
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp < 0 ? -tmp : tmp);
}
#endif
return simde_uint32x4_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdq_u32
#define vabdq_u32(a, b) simde_vabdq_u32((a), (b))
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_ARM_NEON_ABD_H) */
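
The widening in the portable paths above is not cosmetic; an editor's sketch of the concrete failure it avoids:

#include <cstdint>
#include <cstdio>

int main()
{
    int8_t a = 100, b = -100;
    // The true difference, 200, does not fit in int8_t; widening keeps it
    // exact and the final truncation matches the NEON result (-56 mod 2^8).
    int16_t wide = (int16_t)a - (int16_t)b;                 // 200
    int8_t r = (int8_t)(wide < 0 ? -wide : wide);           // -56
    // Storing the difference in int8_t first wraps to -56, and its absolute
    // value, 56, disagrees with the hardware result.
    int8_t naive = (int8_t)(a - b);                         // -56
    int8_t naive_abs = naive < 0 ? (int8_t)-naive : naive;  // 56
    std::printf("%d vs %d\n", r, naive_abs);                // -56 vs 56
    return 0;
}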

View File

@@ -0,0 +1,147 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
*/
#if !defined(SIMDE_ARM_NEON_ABDL_H)
#define SIMDE_ARM_NEON_ABDL_H
#include "abs.h"
#include "subl.h"
#include "movl.h"
#include "reinterpret.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabdl_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_s8(a, b);
#else
return simde_vabsq_s16(simde_vsubl_s8(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_s8
#define vabdl_s8(a, b) simde_vabdl_s8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabdl_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_s16(a, b);
#else
return simde_vabsq_s32(simde_vsubl_s16(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_s16
#define vabdl_s16(a, b) simde_vabdl_s16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabdl_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_s32(a, b);
#else
return simde_vabsq_s64(simde_vsubl_s32(a, b));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_s32
#define vabdl_s32(a, b) simde_vabdl_s32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabdl_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_u8(a, b);
#else
return simde_vreinterpretq_u16_s16(
simde_vabsq_s16(
simde_vsubq_s16(
simde_vreinterpretq_s16_u16(simde_vmovl_u8(a)),
simde_vreinterpretq_s16_u16(simde_vmovl_u8(b))
)
)
);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_u8
#define vabdl_u8(a, b) simde_vabdl_u8((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabdl_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_u16(a, b);
#else
return simde_vreinterpretq_u32_s32(
simde_vabsq_s32(
simde_vsubq_s32(
simde_vreinterpretq_s32_u32(simde_vmovl_u16(a)),
simde_vreinterpretq_s32_u32(simde_vmovl_u16(b))
)
)
);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_u16
#define vabdl_u16(a, b) simde_vabdl_u16((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vabdl_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdl_u32(a, b);
#else
return simde_vreinterpretq_u64_s64(
simde_vabsq_s64(
simde_vsubq_s64(
simde_vreinterpretq_s64_u64(simde_vmovl_u32(a)),
simde_vreinterpretq_s64_u64(simde_vmovl_u32(b))
)
)
);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabdl_u32
#define vabdl_u32(a, b) simde_vabdl_u32((a), (b))
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_ARM_NEON_ABDL_H) */
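
Unlike vabd, vabdl returns a widened element type, so the full difference always survives; a brief editor's sketch (same include-path assumption as above):

#include <simde/arm/neon.h>
#include <cstdio>

int main()
{
    uint8_t a[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    uint8_t b[8] = { 255, 200, 150, 100, 50, 25, 10, 1 };
    uint16_t out[8];
    simde_vst1q_u16(out, simde_vabdl_u8(simde_vld1_u8(a), simde_vld1_u8(b)));
    std::printf("%d ... %d\n", out[0], out[7]);  // 255 ... 1
    return 0;
}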

View File

@@ -0,0 +1,431 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
*/
#if !defined(SIMDE_ARM_NEON_ABS_H)
#define SIMDE_ARM_NEON_ABS_H
#include "types.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_vabsd_s64(int64_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,1,0))
return vabsd_s64(a);
#else
return a < 0 ? -a : a;
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabsd_s64
#define vabsd_s64(a) simde_vabsd_s64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabs_f32(simde_float32x2_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_f32(a);
#else
simde_float32x2_private
r_,
a_ = simde_float32x2_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
return simde_float32x2_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabs_f32
#define vabs_f32(a) simde_vabs_f32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x1_t
simde_vabs_f64(simde_float64x1_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabs_f64(a);
#else
simde_float64x1_private
r_,
a_ = simde_float64x1_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
return simde_float64x1_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
#undef vabs_f64
#define vabs_f64(a) simde_vabs_f64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vabs_s8(simde_int8x8_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s8(a);
#else
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a);
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi8(a_.m64);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762)
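/* Branchless select: the comparison yields all-ones lanes where a < 0, so
 * (-a & m) keeps the negated value there and (a & ~m) keeps a elsewhere. */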
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif
return simde_int8x8_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabs_s8
#define vabs_s8(a) simde_vabs_s8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vabs_s16(simde_int16x4_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s16(a);
#else
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a);
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi16(a_.m64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif
return simde_int16x4_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabs_s16
#define vabs_s16(a) simde_vabs_s16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vabs_s32(simde_int32x2_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabs_s32(a);
#else
simde_int32x2_private
r_,
a_ = simde_int32x2_to_private(a);
#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi32(a_.m64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif
return simde_int32x2_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabs_s32
#define vabs_s32(a) simde_vabs_s32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x1_t
simde_vabs_s64(simde_int64x1_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabs_s64(a);
#else
simde_int64x1_private
r_,
a_ = simde_int64x1_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif
return simde_int64x1_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabs_s64
#define vabs_s64(a) simde_vabs_s64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabsq_f32(simde_float32x4_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabsq_f32(a);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_abs(a);
#else
simde_float32x4_private
r_,
a_ = simde_float32x4_to_private(a);
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_abs(a_.v128);
#elif defined(SIMDE_X86_SSE_NATIVE)
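/* IEEE-754 floats keep their sign in the most significant bit, so fabsf()
 * is a bitwise AND with 0x7FFFFFFF; the memcpy builds that constant as a
 * float bit pattern without violating strict aliasing. */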
simde_float32 mask_;
uint32_t u32_ = UINT32_C(0x7FFFFFFF);
simde_memcpy(&mask_, &u32_, sizeof(u32_));
r_.m128 = _mm_and_ps(_mm_set1_ps(mask_), a_.m128);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_math_fabsf(a_.values[i]);
}
#endif
return simde_float32x4_from_private(r_);
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vabsq_f32
#define vabsq_f32(a) simde_vabsq_f32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x2_t
simde_vabsq_f64(simde_float64x2_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabsq_f64(a);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
return vec_abs(a);
#else
simde_float64x2_private
r_,
a_ = simde_float64x2_to_private(a);
#if defined(SIMDE_X86_SSE2_NATIVE)
simde_float64 mask_;
uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
simde_memcpy(&mask_, &u64_, sizeof(u64_));
r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d);
#else
SIMDE_VECTORIZE