From dc247c9f9913e336200ecf8bb72152fdabdb3585 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 15 Sep 2025 18:19:55 -0700
Subject: [PATCH] Speed up autoconf, use POSIX names instead of cpu_to_le*()

The POSIX names for these functions are htole*(). Use those
preferentially.

Speed up autoconf by allowing early-out during alternative function
searches.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
---
 asm/assemble.c                |  6 +--
 autoconf/m4/pa_add_headers.m4 |  1 -
 autoconf/m4/pa_csym.m4        | 13 ++++++
 autoconf/m4/pa_endian.m4      | 63 ++++++++++++++++++--------
 autoconf/m4/pa_find_func.m4   | 16 +++++++
 autoconf/m4/pa_have_func.m4   | 42 ++++++++++++------
 autoconf/m4/pa_sym.m4         |  7 ++-
 config/unconfig.h             | 12 ++---
 configure.ac                  | 19 ++++----
 include/bytesex.h             | 80 ++++++++++++++++-----------------
 nasmlib/file.c                |  8 ++--
 nasmlib/saa.c                 |  8 ++--
 output/outelf.c               | 84 +++++++++++++++++------------------
 13 files changed, 212 insertions(+), 147 deletions(-)
 create mode 100644 autoconf/m4/pa_csym.m4
 create mode 100644 autoconf/m4/pa_find_func.m4

diff --git a/asm/assemble.c b/asm/assemble.c
index 804fc3cd..340c84dc 100644
--- a/asm/assemble.c
+++ b/asm/assemble.c
@@ -424,7 +424,7 @@ static void out(struct out_data *data)
                     data->flags &= ~OUT_SIGNMASK;
                 }
                 warn_overflow_out(addrval, asize, data->flags, data->what);
-                xdata.q = cpu_to_le64(addrval);
+                xdata.q = htole64(addrval);
                 data->data = xdata.b;
                 data->type = OUT_RAWDATA;
                 asize = amax = 0;   /* No longer an address */
@@ -661,7 +661,7 @@ static void out_rawbyte(struct out_data *data, uint8_t byte)
 #if 0                           /* Currently unused */
 static void out_rawword(struct out_data *data, uint16_t value)
 {
-    uint16_t buf = cpu_to_le16(value);
+    uint16_t buf = htole16(value);
     data->type = OUT_RAWDATA;
     data->data = &buf;
     data->size = 2;
@@ -671,7 +671,7 @@ static void out_rawword(struct out_data *data, uint16_t value)
 
 static void out_rawdword(struct out_data *data, uint32_t value)
 {
-    uint32_t buf = cpu_to_le32(value);
+    uint32_t buf = htole32(value);
     data->type = OUT_RAWDATA;
     data->data = &buf;
     data->size = 4;
diff --git a/autoconf/m4/pa_add_headers.m4 b/autoconf/m4/pa_add_headers.m4
index d3c478a4..0da144d1 100644
--- a/autoconf/m4/pa_add_headers.m4
+++ b/autoconf/m4/pa_add_headers.m4
@@ -8,6 +8,5 @@ AC_DEFUN([_PA_ADD_HEADER],
 #include <$1>"
 ])
 ])
-
 AC_DEFUN([PA_ADD_HEADERS],
 [m4_map_args_w([$1],[_PA_ADD_HEADER(],[)])])
diff --git a/autoconf/m4/pa_csym.m4 b/autoconf/m4/pa_csym.m4
new file mode 100644
index 00000000..ea0bf809
--- /dev/null
+++ b/autoconf/m4/pa_csym.m4
@@ -0,0 +1,13 @@
+dnl --------------------------------------------------------------------------
+dnl PA_CSYM(prefix, string)
+dnl
+dnl Convert a (semi-) arbitrary string to a CPP symbol
+dnl Convert non-C characters to underscore, except + which is converted
+dnl to X (so C++ -> CXX). Unlike PA_SYM(), do not compact multiple
+dnl underscores.
+dnl --------------------------------------------------------------------------
+AC_DEFUN([PA_CSYM],
+[m4_toupper(m4_bpatsubsts(m4_quote(m4_toupper(m4_normalize([$*]))),
+[[ ]+],[],[\+],[X],[^\(.\)\([0123456789].*\)$],[[[\1_\2]]],
+[[^ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_]],[_],
+[^._\(.*\)_.$],[[[\1]]]))])
diff --git a/autoconf/m4/pa_endian.m4 b/autoconf/m4/pa_endian.m4
index 9f8bfad4..fd9fb4eb 100644
--- a/autoconf/m4/pa_endian.m4
+++ b/autoconf/m4/pa_endian.m4
@@ -11,22 +11,47 @@ byte first (like Motorola and SPARC, unlike Intel and VAX).])
 AH_TEMPLATE(WORDS_LITTLEENDIAN,
 [Define to 1 if your processor stores words with the least significant
 byte first (like Intel and VAX, unlike Motorola and SPARC).])
-PA_ADD_HEADERS(stdbit.h endian.h sys/endian.h machine/endian.h)
-PA_HAVE_FUNC(cpu_to_le16, (0))
-PA_HAVE_FUNC(cpu_to_le32, (0))
-PA_HAVE_FUNC(cpu_to_le64, (0))
-PA_HAVE_FUNC(__cpu_to_le16, (0))
-PA_HAVE_FUNC(__cpu_to_le32, (0))
-PA_HAVE_FUNC(__cpu_to_le64, (0))
-PA_HAVE_FUNC(htole16, (0))
-PA_HAVE_FUNC(htole32, (0))
-PA_HAVE_FUNC(htole64, (0))
-PA_HAVE_FUNC(__bswap_16, (0))
-PA_HAVE_FUNC(__bswap_32, (0))
-PA_HAVE_FUNC(__bswap_64, (0))
-PA_HAVE_FUNC(__builtin_bswap16, (0))
-PA_HAVE_FUNC(__builtin_bswap32, (0))
-PA_HAVE_FUNC(__builtin_bswap64, (0))
-PA_HAVE_FUNC(_byteswap_ushort, (0))
-PA_HAVE_FUNC(_byteswap_ulong, (0))
-PA_HAVE_FUNC(_byteswap_uint64, (0))])
+AC_CHECK_HEADERS_ONCE(stdbit.h)dnl C23 standard header for this stuff
+dnl Note: always look for the canonical POSIX version, to make sure to
+dnl avoid conflict when substituting
+AS_IF([test x$ac_cv_c_bigendian = xno],
+[
+dnl Littleendian
+PA_HAVE_FUNC(htole16,,[endian.h sys/endian.h machine/endian.h])
+PA_HAVE_FUNC(htole32,,[endian.h sys/endian.h machine/endian.h])
+PA_HAVE_FUNC(htole64,,[endian.h sys/endian.h machine/endian.h])
+],[
+dnl Maybe not littleendian
+PA_FIND_FUNC([htole16,,[endian.h sys/endian.h machine/endian.h]],
+[__builtin_bswap16], [bswap_16,,[byteswap.h]], [_byteswap_ushort,,[stdlib.h]],
+[cpu_to_le16], [__cpu_to_le16])
+PA_FIND_FUNC([htole32,,[endian.h sys/endian.h machine/endian.h]],
+[__builtin_bswap32], [bswap_32,,[byteswap.h]], [_byteswap_ulong,,[stdlib.h]],
+[cpu_to_le32], [__cpu_to_le32])
+PA_FIND_FUNC([htole64,,[endian.h sys/endian.h machine/endian.h]],
+[__builtin_bswap64], [bswap_64,,[byteswap.h]], [_byteswap_uint64,,[stdlib.h]],
+[cpu_to_le64], [__cpu_to_le64])
+])
+AS_IF([test x$ac_cv_c_bigendian = xyes],
+[
+dnl Bigendian
+PA_HAVE_FUNC(htobe16,,[endian.h sys/endian.h machine/endian.h])
+PA_HAVE_FUNC(htobe32,,[endian.h sys/endian.h machine/endian.h])
+PA_HAVE_FUNC(htobe64,,[endian.h sys/endian.h machine/endian.h])
+],[
+dnl Maybe not bigendian
+PA_FIND_FUNC([htobe16,,[endian.h sys/endian.h machine/endian.h]],
+[htons,,[arpa/inet.h]],
+[__builtin_bswap16], [bswap_16,,[byteswap.h]], [_byteswap_ushort,,[stdlib.h]],
+[cpu_to_be16], [__cpu_to_be16])
+PA_FIND_FUNC([htobe32,,[endian.h sys/endian.h machine/endian.h]],
+[htonl,,[arpa/inet.h]],
+[__builtin_bswap32], [bswap_32,,[byteswap.h]], [_byteswap_ulong,,[stdlib.h]],
+[cpu_to_be32], [__cpu_to_be32])
+PA_FIND_FUNC([htobe64,,[endian.h sys/endian.h machine/endian.h]],
+[htonq,,[arpa/inet.h]],
+[__builtin_bswap64], [bswap_64,,[byteswap.h]], [_byteswap_uint64,,[stdlib.h]],
+[cpu_to_be64], [__cpu_to_be64])
+])
+])
+
diff --git a/autoconf/m4/pa_find_func.m4 b/autoconf/m4/pa_find_func.m4
new file mode 100644
index 00000000..bba4dfe1
--- /dev/null
+++ b/autoconf/m4/pa_find_func.m4
@@ -0,0 +1,16 @@
+dnl --------------------------------------------------------------------------
+dnl PA_FIND_FUNC(func_description ...)
+dnl
+dnl Each argument must be a list of arguments to PA_HAVE_FUNC. Stop after
+dnl the first function in the list found.
+dnl --------------------------------------------------------------------------
+AC_DEFUN([_PA_FIND_FUNC],
+[
+AS_IF([test x"$pa_find_func_found" != xyes],
+[PA_HAVE_FUNC($@)
+pa_find_func_found="$pa_cv_func_$1"
+])])
+
+AC_DEFUN([PA_FIND_FUNC],
+[pa_find_func_found=no
+m4_map([_PA_FIND_FUNC],[$@])])
diff --git a/autoconf/m4/pa_have_func.m4 b/autoconf/m4/pa_have_func.m4
index 867320e0..d32b1fd1 100644
--- a/autoconf/m4/pa_have_func.m4
+++ b/autoconf/m4/pa_have_func.m4
@@ -1,20 +1,34 @@
 dnl --------------------------------------------------------------------------
-dnl PA_HAVE_FUNC(func_name, arguments)
+dnl PA_HAVE_FUNC([func_name ...][,arguments [,headers [,return_type]]])
 dnl
 dnl Look for a function with the specified arguments which could be
-dnl a macro/builtin/intrinsic function.
+dnl a macro/builtin/intrinsic function. If "arguments" are omitted,
+dnl then (0) is assumed; if "return_type" is omitted or "void", the
+dnl expression is cast to (void).
 dnl --------------------------------------------------------------------------
+AC_DEFUN([_PA_HAVE_FUNC_INCLUDE],
+[m4_echo([#ifdef ]PA_CSYM([HAVE_$1])[
+#include <$1>
+#endif
+])])
+
 AC_DEFUN([PA_HAVE_FUNC],
-[AC_MSG_CHECKING([for $1])
- AC_LINK_IFELSE([AC_LANG_SOURCE([
-AC_INCLUDES_DEFAULT
-int main(void) {
-    (void)$1$2;
-    return 0;
-}
- ])],
- [AC_MSG_RESULT([yes])
-  AC_DEFINE(AS_TR_CPP([HAVE_$1]), 1,
-  [Define to 1 if you have the `$1' intrinsic function.])],
- [AC_MSG_RESULT([no])])
+[AC_CACHE_CHECK([for $1], [pa_cv_func_$1],
+[
+m4_ifnblank([$3],[AC_CHECK_HEADERS_ONCE($3)])dnl
+m4_pushdef([pa_func_args],m4_strip(m4_default([$2],[(0)])))dnl
+m4_pushdef([pa_func_type],m4_default([$4],[void]))dnl
+AC_LINK_IFELSE([AC_LANG_PROGRAM(
+m4_map_args_w([$3], [_PA_HAVE_FUNC_INCLUDE(], [)]),
+m4_cond(pa_func_type,[void],[
+    (void)$1]pa_func_args[;],[
+    ]pa_func_type[ tmp = $1]pa_func_args[;
+    (void)tmp;])
+)],[pa_cv_func_$1=yes],[pa_cv_func_$1=no])
+m4_popdef([pa_func_args])dnl
+m4_popdef([pa_func_type])dnl
 ])
+
+AS_IF([test "x$pa_cv_func_$1" = xyes],
+[AC_DEFINE(PA_CSYM([HAVE_$1]), 1,
+[Define to 1 if you have the `$1' intrinsic function.])])])
diff --git a/autoconf/m4/pa_sym.m4 b/autoconf/m4/pa_sym.m4
index d3a8965d..737f17d8 100644
--- a/autoconf/m4/pa_sym.m4
+++ b/autoconf/m4/pa_sym.m4
@@ -4,8 +4,7 @@ dnl
 dnl Convert a (semi-) arbitrary string to a CPP symbol
 dnl Compact underscores and convert non-C characters to underscore,
 dnl except + which is converted to X (so C++ -> CXX).
+dnl
+dnl Contract multiple underscores together.
 dnl --------------------------------------------------------------------------
-AC_DEFUN([PA_SYM],
-[m4_bpatsubsts(m4_quote(m4_toupper([$*])),
- [,],[],[\+],[X],[[^ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]+],[_],dnl
-[^._?\(.*\)_.$],[[\1]])])
+AC_DEFUN([PA_SYM],[m4_bpatsubsts(PA_CSYM([$*]),[__+],[_])])
diff --git a/config/unconfig.h b/config/unconfig.h
index 3afcb206..824f18c4 100644
--- a/config/unconfig.h
+++ b/config/unconfig.h
@@ -4,7 +4,7 @@
 #define CONFIG_UNCONFIG_H
 
 #ifndef alloc_size_func2
-# ifdef HAVE_FUNC_ATTRIBUTE2_ALLOC_SIZE
+# ifdef HAVE_FUNC_ATTRIBUTE_2_ALLOC_SIZE
 #  define alloc_size_func2(x1,x2) ATTRIBUTE(alloc_size(x1,x2))
 # else
 #  define alloc_size_func2(x1,x2)
@@ -12,7 +12,7 @@
 #endif
 
 #ifndef alloc_size_func2_ptr
-# ifdef HAVE_FUNC_PTR_ATTRIBUTE2_ALLOC_SIZE
+# ifdef HAVE_FUNC_PTR_ATTRIBUTE_2_ALLOC_SIZE
 #  define alloc_size_func2_ptr(x1,x2) ATTRIBUTE(alloc_size(x1,x2))
 # else
 #  define alloc_size_func2_ptr(x1,x2)
@@ -36,7 +36,7 @@
 #endif
 
 #ifndef format_func3
-# ifdef HAVE_FUNC_ATTRIBUTE3_FORMAT
+# ifdef HAVE_FUNC_ATTRIBUTE_3_FORMAT
 #  define format_func3(x1,x2,x3) ATTRIBUTE(format(x1,x2,x3))
 # else
 #  define format_func3(x1,x2,x3)
@@ -44,7 +44,7 @@
 #endif
 
 #ifndef format_func3_ptr
-# ifdef HAVE_FUNC_PTR_ATTRIBUTE3_FORMAT
+# ifdef HAVE_FUNC_PTR_ATTRIBUTE_3_FORMAT
 #  define format_func3_ptr(x1,x2,x3) ATTRIBUTE(format(x1,x2,x3))
 # else
 #  define format_func3_ptr(x1,x2,x3)
@@ -164,7 +164,7 @@
 #endif
 
 #ifndef alloc_size_func1
-# ifdef HAVE_FUNC_ATTRIBUTE1_ALLOC_SIZE
+# ifdef HAVE_FUNC_ATTRIBUTE_1_ALLOC_SIZE
 #  define alloc_size_func1(x1) ATTRIBUTE(alloc_size(x1))
 # else
 #  define alloc_size_func1(x1)
@@ -172,7 +172,7 @@
 #endif
 
 #ifndef alloc_size_func1_ptr
-# ifdef HAVE_FUNC_PTR_ATTRIBUTE1_ALLOC_SIZE
+# ifdef HAVE_FUNC_PTR_ATTRIBUTE_1_ALLOC_SIZE
 #  define alloc_size_func1_ptr(x1) ATTRIBUTE(alloc_size(x1))
 # else
 #  define alloc_size_func1_ptr(x1)
diff --git a/configure.ac b/configure.ac
index 1c545c11..3019d40c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -226,7 +226,7 @@ AC_CHECK_FUNCS(sysconf)
 
 AC_CHECK_FUNCS([access _access faccessat])
 
-PA_HAVE_FUNC(__builtin_expect, (1,1))
+PA_HAVE_FUNC(__builtin_expect,(1,1))
 
 PA_FUNC_SNPRINTF
 PA_FUNC_VSNPRINTF
@@ -264,20 +264,19 @@ PA_ARG_DISABLED([pdf-compression],
 AC_SUBST([PDFOPT])
 
 dnl
-dnl Look for byte-swapping support and <stdbit.h>
+dnl Look for byte-swapping support and endian detection
 dnl
 PA_ENDIAN
 
 dnl
-dnl ilog2() building blocks, must come after PA_ENDIAN for <stdbit.h>
+dnl ilog2() building blocks
 dnl
-PA_HAVE_FUNC(stdc_leading_zeros, (0UL))
-PA_ADD_HEADERS(intrin.h)
-PA_HAVE_FUNC(__builtin_clz, (0U))
-PA_HAVE_FUNC(__builtin_clzl, (0UL))
-PA_HAVE_FUNC(__builtin_clzll, (0ULL))
-PA_HAVE_FUNC(_BitScanReverse, (0))
-PA_HAVE_FUNC(_BitScanReverse64, (0))
+PA_HAVE_FUNC([stdc_leading_zeros],(0U),[stdbit.h])
+AS_IF([test x$pa_cv_func_stdc_leading_zeros != xyes],
+[PA_FIND_FUNC([__builtin_clz,(0U)],
+[_BitScanReverse,(0UL),[intrin.h]])
+PA_FIND_FUNC([__builtin_clzll,(0ULL)],
+[_BitScanReverse64,(0ULL),[intrin.h]])])
 
 dnl
 dnl Some rather useful gcc extensions...
diff --git a/include/bytesex.h b/include/bytesex.h
index 00e1c8ba..7450ad7c 100644
--- a/include/bytesex.h
+++ b/include/bytesex.h
@@ -73,15 +73,15 @@
                                                                         \
     return xx
 
-#ifndef HAVE_CPU_TO_LE16
-static inline uint16_t cpu_to_le16(uint16_t v)
+#ifndef HAVE_HTOLE16
+static inline uint16_t htole16(uint16_t v)
 {
 # ifdef WORDS_LITTLEENDIAN
     return v;
+# elif defined(HAVE_CPU_TO_LE16)
+    return cpu_to_le16(v);
 # elif defined(HAVE___CPU_TO_LE16)
     return __cpu_to_le16(v);
-# elif defined(HAVE_HTOLE16)
-    return htole16(v);
 # elif defined(WORDS_BIGENDIAN)
 #  ifdef HAVE___BSWAP_16
     return __bswap_16(v);
@@ -99,15 +99,15 @@ static inline uint16_t cpu_to_le16(uint16_t v)
 }
 #endif
 
-#ifndef HAVE_CPU_TO_LE32
-static inline uint32_t cpu_to_le32(uint32_t v)
+#ifndef HAVE_HTOLE32
+static inline uint32_t htole32(uint32_t v)
 {
 # ifdef WORDS_LITTLEENDIAN
     return v;
+# elif defined(HAVE_CPU_TO_LE32)
+    return cpu_to_le32(v);
 # elif defined(HAVE___CPU_TO_LE32)
     return __cpu_to_le32(v);
-# elif defined(HAVE_HTOLE32)
-    return htole32(v);
 # elif defined(WORDS_BIGENDIAN)
 #  ifdef HAVE___BSWAP_32
     return __bswap_32(v);
@@ -126,15 +126,15 @@ static inline uint32_t cpu_to_le32(uint32_t v)
 }
 #endif
 
-#ifndef HAVE_CPU_TO_LE64
-static inline uint64_t cpu_to_le64(uint64_t v)
+#ifndef HAVE_HTOLE64
+static inline uint64_t htole64(uint64_t v)
 {
 #ifdef WORDS_LITTLEENDIAN
     return v;
+# elif defined(HAVE_CPU_TO_LE64)
+    return cpu_to_le64(v);
 # elif defined(HAVE___CPU_TO_LE64)
     return __cpu_to_le64(v);
-# elif defined(HAVE_HTOLE64)
-    return htole64(v);
 # elif defined(WORDS_BIGENDIAN)
 #  ifdef HAVE___BSWAP_64
     return __bswap_64(v);
@@ -155,8 +155,8 @@ static inline uint64_t cpu_to_le64(uint64_t v)
 }
 #endif
 
-#ifndef HAVE_LE16_TO_CPU
-static inline uint16_t le16_to_cpu(uint16_t v)
+#ifndef HAVE_HTOLE16
+static inline uint16_t le16toh(uint16_t v)
 {
 #ifdef WORDS_LITTLEENDIAN
     return v;
@@ -165,41 +165,41 @@ static inline uint16_t le16_to_cpu(uint16_t v)
 # elif defined(HAVE_LE16TOH)
     return le64toh(v);
 # elif defined(WORDS_BIGENDIAN)
-    return cpu_to_le16(v);
+    return htole16(v);
 # else
     LE_TO_CPU(16);
 # endif
 }
 #endif
 
-#ifndef HAVE_LE32_TO_CPU
-static inline uint32_t le32_to_cpu(uint32_t v)
+#ifndef HAVE_HTOLE32
+static inline uint32_t le32toh(uint32_t v)
 {
 #ifdef WORDS_LITTLEENDIAN
     return v;
-# elif defined(HAVE___LE32_TO_CPU)
+# elif defined(HAVE_CPU_TO_LE32)
+    return le32_to_cpu(v);
+# elif defined(HAVE___CPU_TO_LE32)
     return __le32_to_cpu(v);
-# elif defined(HAVE_LE32TOH)
-    return le64toh(v);
 # elif defined(WORDS_BIGENDIAN)
-    return cpu_to_le32(v);
+    return htole32(v);
 # else
     LE_TO_CPU(32);
 # endif
 }
 #endif
 
-#ifndef HAVE_LE64_TO_CPU
-static inline uint64_t le64_to_cpu(uint64_t v)
+#ifndef HAVE_HTOLE64
+static inline uint64_t le64toh(uint64_t v)
 {
 #ifdef WORDS_LITTLEENDIAN
     return v;
-# elif defined(HAVE___LE64_TO_CPU)
+# elif defined(HAVE_CPU_TO_LE64)
+    return le64_to_cpu(v);
+# elif defined(HAVE___CPU_TO_LE64)
     return __le64_to_cpu(v);
-# elif defined(HAVE_LE64TOH)
-    return le64toh(v);
 # elif defined(WORDS_BIGENDIAN)
-    return cpu_to_le64(v);
+    return htole64(v);
 # else
     LE_TO_CPU(64);
 # endif
@@ -232,11 +232,11 @@ struct unaligned16 {
 } __attribute__((packed));
 static inline uint16_t getu16(const void *p)
 {
-    return le16_to_cpu(((const struct unaligned16 *)p)->v);
+    return le16toh(((const struct unaligned16 *)p)->v);
 }
 static inline uint16_t setu16(void *p, uint16_t v)
 {
-    ((struct unaligned16 *)p)->v = cpu_to_le16(v);
+    ((struct unaligned16 *)p)->v = htole16(v);
     return v;
 }
 
@@ -245,11 +245,11 @@ struct unaligned32 {
 } __attribute__((packed));
 static inline uint32_t getu32(const void *p)
 {
-    return le32_to_cpu(((const struct unaligned32 *)p)->v);
+    return le32toh(((const struct unaligned32 *)p)->v);
 }
 static inline uint32_t setu32(void *p, uint32_t v)
 {
-    ((struct unaligned32 *)p)->v = cpu_to_le32(v);
+    ((struct unaligned32 *)p)->v = htole32(v);
     return v;
 }
 
@@ -258,11 +258,11 @@ struct unaligned64 {
 } __attribute__((packed));
 static inline uint64_t getu64(const void *p)
 {
-    return le64_to_cpu(((const struct unaligned64 *)p)->v);
+    return le64toh(((const struct unaligned64 *)p)->v);
 }
 static inline uint64_t setu64(void *p, uint64_t v)
 {
-    ((struct unaligned64 *)p)->v = cpu_to_le64(v);
+    ((struct unaligned64 *)p)->v = htole64(v);
     return v;
 }
 
@@ -271,36 +271,36 @@ static inline uint64_t setu64(void *p, uint64_t v)
 static inline uint16_t getu16(const void *p)
 {
     const uint16_t _unaligned *pp = p;
-    return le16_to_cpu(*pp);
+    return le16toh(*pp);
 }
 static inline uint16_t setu16(void *p, uint16_t v)
 {
     uint16_t _unaligned *pp = p;
-    *pp = cpu_to_le16(v);
+    *pp = htole16(v);
     return v;
 }
 
 static inline uint32_t getu32(const void *p)
 {
     const uint32_t _unaligned *pp = p;
-    return le32_to_cpu(*pp);
+    return le32toh(*pp);
 }
 static inline uint32_t setu32(void *p, uint32_t v)
 {
     uint32_t _unaligned *pp = p;
-    *pp = cpu_to_le32(v);
+    *pp = htole32(v);
     return v;
 }
 
 static inline uint64_t getu64(const void *p)
 {
     const uint64_t _unaligned *pp = p;
-    return le64_to_cpu(*pp);
+    return le64toh(*pp);
 }
 static inline uint64_t setu64(void *p, uint64_t v)
 {
-    uint32_t _unaligned *pp = p;
-    *pp = cpu_to_le64(v);
+    uint64_t _unaligned *pp = p;
+    *pp = htole64(v);
     return v;
 }
 
@@ -396,7 +396,7 @@ static inline uint64_t setu64(void *p, uint64_t v)
 
 #define WRITEADDR(p,v,s)                                        \
     do {                                                        \
-        const uint64_t _wa_v = cpu_to_le64(v);                  \
+        const uint64_t _wa_v = htole64(v);                  \
         (p) = mempcpy((p), &_wa_v, (s));                        \
     } while (0)
 
diff --git a/nasmlib/file.c b/nasmlib/file.c
index 62b854de..2ec3204b 100644
--- a/nasmlib/file.c
+++ b/nasmlib/file.c
@@ -52,25 +52,25 @@ void nasm_write(const void *ptr, size_t size, FILE *f)
 
 void fwriteint16_t(uint16_t data, FILE * fp)
 {
-    data = cpu_to_le16(data);
+    data = htole16(data);
     nasm_write(&data, 2, fp);
 }
 
 void fwriteint32_t(uint32_t data, FILE * fp)
 {
-    data = cpu_to_le32(data);
+    data = htole32(data);
     nasm_write(&data, 4, fp);
 }
 
 void fwriteint64_t(uint64_t data, FILE * fp)
 {
-    data = cpu_to_le64(data);
+    data = htole64(data);
     nasm_write(&data, 8, fp);
 }
 
 void fwriteaddr(uint64_t data, int size, FILE * fp)
 {
-    data = cpu_to_le64(data);
+    data = htole64(data);
     nasm_write(&data, size, fp);
 }
 
diff --git a/nasmlib/saa.c b/nasmlib/saa.c
index dcc2c019..4080e105 100644
--- a/nasmlib/saa.c
+++ b/nasmlib/saa.c
@@ -307,25 +307,25 @@ void saa_write8(struct SAA *s, uint8_t v)
 
 void saa_write16(struct SAA *s, uint16_t v)
 {
-    v = cpu_to_le16(v);
+    v = htole16(v);
     saa_wbytes(s, &v, 2);
 }
 
 void saa_write32(struct SAA *s, uint32_t v)
 {
-    v = cpu_to_le32(v);
+    v = htole32(v);
     saa_wbytes(s, &v, 4);
 }
 
 void saa_write64(struct SAA *s, uint64_t v)
 {
-    v = cpu_to_le64(v);
+    v = htole64(v);
     saa_wbytes(s, &v, 8);
 }
 
 void saa_writeaddr(struct SAA *s, uint64_t v, size_t len)
 {
-    v = cpu_to_le64(v);
+    v = htole64(v);
     saa_wbytes(s, &v, len);
 }
 
diff --git a/output/outelf.c b/output/outelf.c
index 3ec5c70a..2fc7f58c 100644
--- a/output/outelf.c
+++ b/output/outelf.c
@@ -1826,7 +1826,7 @@ rel12adr:
  */
 static inline uint16_t elf_shndx(int section, uint16_t overflow)
 {
-    return cpu_to_le16(section < (int)SHN_LORESERVE ? section : overflow);
+    return htole16(section < (int)SHN_LORESERVE ? section : overflow);
 }
 
 struct ehdr_common {
@@ -1911,20 +1911,20 @@ static void elf_write(void)
     ehdr.com.e_ident[EI_VERSION]    = EV_CURRENT;
     ehdr.com.e_ident[EI_OSABI]      = elf_osabi;
     ehdr.com.e_ident[EI_ABIVERSION] = elf_abiver;
-    ehdr.com.e_type                 = cpu_to_le16(ET_REL);
-    ehdr.com.e_machine              = cpu_to_le16(efmt->e_machine);
-    ehdr.com.e_version              = cpu_to_le16(EV_CURRENT);
+    ehdr.com.e_type                 = htole16(ET_REL);
+    ehdr.com.e_machine              = htole16(efmt->e_machine);
+    ehdr.com.e_version              = htole16(EV_CURRENT);
 
     if (!efmt->elf64) {
-        ehdr.ehdr32.e_shoff         = cpu_to_le32(sizeof ehdr);
-        ehdr.ehdr32.e_ehsize        = cpu_to_le16(sizeof(Elf32_Ehdr));
-        ehdr.ehdr32.e_shentsize     = cpu_to_le16(sizeof(Elf32_Shdr));
+        ehdr.ehdr32.e_shoff         = htole32(sizeof ehdr);
+        ehdr.ehdr32.e_ehsize        = htole16(sizeof(Elf32_Ehdr));
+        ehdr.ehdr32.e_shentsize     = htole16(sizeof(Elf32_Shdr));
         ehdr.ehdr32.e_shnum         = elf_shndx(nsections, 0);
         ehdr.ehdr32.e_shstrndx      = elf_shndx(sec_shstrtab, SHN_XINDEX);
     } else {
-        ehdr.ehdr64.e_shoff         = cpu_to_le64(sizeof ehdr);
-        ehdr.ehdr64.e_ehsize        = cpu_to_le16(sizeof(Elf64_Ehdr));
-        ehdr.ehdr64.e_shentsize     = cpu_to_le16(sizeof(Elf64_Shdr));
+        ehdr.ehdr64.e_shoff         = htole64(sizeof ehdr);
+        ehdr.ehdr64.e_ehsize        = htole16(sizeof(Elf64_Ehdr));
+        ehdr.ehdr64.e_shentsize     = htole16(sizeof(Elf64_Shdr));
         ehdr.ehdr64.e_shnum         = elf_shndx(nsections, 0);
         ehdr.ehdr64.e_shstrndx      = elf_shndx(sec_shstrtab, SHN_XINDEX);
     }
@@ -2114,9 +2114,9 @@ static void elf32_sym(const struct elf_symbol *sym)
 {
     Elf32_Sym sym32;
 
-    sym32.st_name     = cpu_to_le32(sym->strpos);
-    sym32.st_value    = cpu_to_le32(sym->symv.key);
-    sym32.st_size     = cpu_to_le32(sym->size);
+    sym32.st_name     = htole32(sym->strpos);
+    sym32.st_value    = htole32(sym->symv.key);
+    sym32.st_size     = htole32(sym->size);
     sym32.st_info     = sym->type;
     sym32.st_other    = sym->other;
     sym32.st_shndx    = elf_shndx(sym->section, SHN_XINDEX);
@@ -2127,9 +2127,9 @@ static void elf64_sym(const struct elf_symbol *sym)
 {
     Elf64_Sym sym64;
 
-    sym64.st_name     = cpu_to_le32(sym->strpos);
-    sym64.st_value    = cpu_to_le64(sym->symv.key);
-    sym64.st_size     = cpu_to_le64(sym->size);
+    sym64.st_name     = htole32(sym->strpos);
+    sym64.st_value    = htole64(sym->symv.key);
+    sym64.st_size     = htole64(sym->size);
     sym64.st_info     = sym->type;
     sym64.st_other    = sym->other;
     sym64.st_shndx    = elf_shndx(sym->section, SHN_XINDEX);
@@ -2240,8 +2240,8 @@ static struct SAA *elf32_build_reltab(const struct elf_reloc *r)
         if (sym >= GLOBAL_TEMP_BASE)
             sym += global_offset;
 
-        rel32.r_offset    = cpu_to_le32(r->address);
-        rel32.r_info      = cpu_to_le32(ELF32_R_INFO(sym, r->type));
+        rel32.r_offset    = htole32(r->address);
+        rel32.r_info      = htole32(ELF32_R_INFO(sym, r->type));
         saa_wbytes(s, &rel32, sizeof rel32);
 
         r = r->next;
@@ -2274,9 +2274,9 @@ static struct SAA *elfx32_build_reltab(const struct elf_reloc *r)
         if (sym >= GLOBAL_TEMP_BASE)
             sym += global_offset;
 
-        rela32.r_offset   = cpu_to_le32(r->address);
-        rela32.r_info     = cpu_to_le32(ELF32_R_INFO(sym, r->type));
-        rela32.r_addend   = cpu_to_le32(r->offset);
+        rela32.r_offset   = htole32(r->address);
+        rela32.r_info     = htole32(ELF32_R_INFO(sym, r->type));
+        rela32.r_addend   = htole32(r->offset);
         saa_wbytes(s, &rela32, sizeof rela32);
 
         r = r->next;
@@ -2309,9 +2309,9 @@ static struct SAA *elf64_build_reltab(const struct elf_reloc *r)
         if (sym >= GLOBAL_TEMP_BASE)
             sym += global_offset;
 
-        rela64.r_offset   = cpu_to_le64(r->address);
-        rela64.r_info     = cpu_to_le64(ELF64_R_INFO(sym, r->type));
-        rela64.r_addend   = cpu_to_le64(r->offset);
+        rela64.r_offset   = htole64(r->address);
+        rela64.r_info     = htole64(ELF64_R_INFO(sym, r->type));
+        rela64.r_addend   = htole64(r->offset);
         saa_wbytes(s, &rela64, sizeof rela64);
 
         r = r->next;
@@ -2333,35 +2333,35 @@ static void elf_section_header(int name, int type, uint64_t flags,
     if (!efmt->elf64) {
         Elf32_Shdr  shdr;
 
-        shdr.sh_name         = cpu_to_le32(name);
-        shdr.sh_type         = cpu_to_le32(type);
-        shdr.sh_flags        = cpu_to_le32(flags);
+        shdr.sh_name         = htole32(name);
+        shdr.sh_type         = htole32(type);
+        shdr.sh_flags        = htole32(flags);
         shdr.sh_addr         = 0;
-        shdr.sh_offset       = cpu_to_le32(type == SHT_NULL ? 0 : elf_foffs);
-        shdr.sh_size         = cpu_to_le32(datalen);
+        shdr.sh_offset       = htole32(type == SHT_NULL ? 0 : elf_foffs);
+        shdr.sh_size         = htole32(datalen);
         if (data)
             elf_foffs += ALIGN(datalen, SEC_FILEALIGN);
-        shdr.sh_link         = cpu_to_le32(link);
-        shdr.sh_info         = cpu_to_le32(info);
-        shdr.sh_addralign    = cpu_to_le32(align);
-        shdr.sh_entsize      = cpu_to_le32(entsize);
+        shdr.sh_link         = htole32(link);
+        shdr.sh_info         = htole32(info);
+        shdr.sh_addralign    = htole32(align);
+        shdr.sh_entsize      = htole32(entsize);
 
         nasm_write(&shdr, sizeof shdr, ofile);
     } else {
         Elf64_Shdr  shdr;
 
-        shdr.sh_name         = cpu_to_le32(name);
-        shdr.sh_type         = cpu_to_le32(type);
-        shdr.sh_flags        = cpu_to_le64(flags);
+        shdr.sh_name         = htole32(name);
+        shdr.sh_type         = htole32(type);
+        shdr.sh_flags        = htole64(flags);
         shdr.sh_addr         = 0;
-        shdr.sh_offset       = cpu_to_le64(type == SHT_NULL ? 0 : elf_foffs);
-        shdr.sh_size         = cpu_to_le64(datalen);
+        shdr.sh_offset       = htole64(type == SHT_NULL ? 0 : elf_foffs);
+        shdr.sh_size         = htole64(datalen);
         if (data)
             elf_foffs += ALIGN(datalen, SEC_FILEALIGN);
-        shdr.sh_link        = cpu_to_le32(link);
-        shdr.sh_info        = cpu_to_le32(info);
-        shdr.sh_addralign   = cpu_to_le64(align);
-        shdr.sh_entsize     = cpu_to_le64(entsize);
+        shdr.sh_link        = htole32(link);
+        shdr.sh_info        = htole32(info);
+        shdr.sh_addralign   = htole64(align);
+        shdr.sh_entsize     = htole64(entsize);
 
         nasm_write(&shdr, sizeof shdr, ofile);
     }