commit 85be0b8c65: update C headers to LLVM 16

upstream commit 0604154e006e88e9e7f82d8ee5fd076bda206613
parent commit b260bf42d3
@@ -666,6 +666,7 @@ __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}

+#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
@@ -713,6 +714,7 @@ __tex_fetch(__DataT *, __RetT *__ptr,
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
+#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
lib/include/__clang_hip_libdevice_declares.h (vendored): 5 lines changed
@@ -288,12 +288,17 @@ __llvm_amdgcn_rsq_f64(double __x) {

__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
                                                          _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
lib/include/__clang_hip_math.h (vendored): 23 lines changed
@@ -70,9 +70,9 @@ __DEVICE__ void __static_assert_equal_size() {
#endif

__DEVICE__
-uint64_t __make_mantissa_base8(const char *__tagp) {
+uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '7')
@@ -87,9 +87,9 @@ uint64_t __make_mantissa_base8(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa_base10(const char *__tagp) {
+uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '9')
@@ -104,9 +104,9 @@ uint64_t __make_mantissa_base10(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa_base16(const char *__tagp) {
+uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '9')
@@ -125,10 +125,7 @@ uint64_t __make_mantissa_base16(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa(const char *__tagp) {
-  if (!__tagp)
-    return 0u;
-
+uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
  if (*__tagp == '0') {
    ++__tagp;

@@ -233,7 +230,7 @@ __DEVICE__
float expm1f(float __x) { return __ocml_expm1_f32(__x); }

__DEVICE__
-float fabsf(float __x) { return __ocml_fabs_f32(__x); }
+float fabsf(float __x) { return __builtin_fabsf(__x); }

__DEVICE__
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
@@ -359,7 +356,7 @@ float modff(float __x, float *__iptr) {
}

__DEVICE__
-float nanf(const char *__tagp) {
+float nanf(const char *__tagp __attribute__((nonnull))) {
  union {
    float val;
    struct ieee_float {
@@ -792,7 +789,7 @@ __DEVICE__
double expm1(double __x) { return __ocml_expm1_f64(__x); }

__DEVICE__
-double fabs(double __x) { return __ocml_fabs_f64(__x); }
+double fabs(double __x) { return __builtin_fabs(__x); }

__DEVICE__
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
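The __make_mantissa changes above trade a runtime NULL test for a compile-time contract: __tagp is now declared __attribute__((nonnull)) and the loop tests *__tagp instead of the pointer itself. A minimal host-side sketch of the same pattern (the function name and main() are illustrative, not part of the header):

    #include <assert.h>
    #include <stdint.h>

    /* Analogue of __make_mantissa_base8: the nonnull annotation lets the
     * callee dereference unconditionally, so the old `while (__tagp)` NULL
     * check becomes a plain end-of-string scan. */
    static uint64_t make_mantissa_base8(const char *tagp
                                        __attribute__((nonnull))) {
      uint64_t r = 0;
      while (*tagp != '\0') {
        char c = *tagp;
        if (c >= '0' && c <= '7')
          r = (r * 8u) + (uint64_t)(c - '0');
        else
          return 0;
        ++tagp;
      }
      return r;
    }

    int main(void) {
      assert(make_mantissa_base8("17") == 15); /* octal 17 */
      /* Passing NULL is now undefined behavior; callers such as nanf()
       * must supply a valid string. */
      return 0;
    }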
lib/include/__clang_hip_runtime_wrapper.h (vendored): 1 line changed
@@ -113,6 +113,7 @@ __attribute__((weak)) inline __device__ void free(void *__ptr) {

#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
+#include <__clang_hip_stdlib.h>

#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
lib/include/__clang_hip_stdlib.h (vendored, new file): 43 lines
@@ -0,0 +1,43 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_HIP_STDLIB_H__

#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif

#if !defined(__cplusplus)

#include <limits.h>

#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif

__DEVICE__
int abs(int __x) {
  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}

#endif // !defined(__cplusplus)

#endif // #define __CLANG_HIP_STDLIB_H__
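The abs family in the new header is branchless: shifting x right by (width - 1) produces 0 for non-negative values and -1 (all ones) for negative ones under the arithmetic right shift clang implements, and (x ^ sgn) - sgn is then a conditional two's-complement negation. A standalone illustration under that same shift assumption:

    #include <limits.h>
    #include <stdio.h>

    /* sgn == 0 when x >= 0, sgn == -1 (all bits set) when x < 0, so the
     * xor flips the bits of negative inputs and the subtraction adds 1,
     * i.e. negates them; non-negative inputs pass through unchanged. */
    static int branchless_abs(int x) {
      int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
      return (x ^ sgn) - sgn;
    }

    int main(void) {
      printf("%d %d %d\n", branchless_abs(-42), branchless_abs(0),
             branchless_abs(7)); /* prints: 42 0 7 */
      /* INT_MIN still overflows, exactly as the standard abs() does. */
      return 0;
    }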
lib/include/altivec.h (vendored): 28 lines changed
@@ -17323,32 +17323,32 @@ provided.
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast

#ifdef __VSX__
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vsbox(vector unsigned long long __a) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned char __a) {
  return __builtin_altivec_crypto_vsbox(__a);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipher(vector unsigned long long __a,
-                         vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned char __a,
+                         vector unsigned char __b) {
  return __builtin_altivec_crypto_vcipher(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipherlast(vector unsigned long long __a,
-                             vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned char __a,
+                             vector unsigned char __b) {
  return __builtin_altivec_crypto_vcipherlast(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipher(vector unsigned long long __a,
-                          vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned char __a,
+                          vector unsigned char __b) {
  return __builtin_altivec_crypto_vncipher(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipherlast(vector unsigned long long __a,
-                              vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned char __a,
+                              vector unsigned char __b) {
  return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
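The crypto wrappers now operate on vector unsigned char rather than vector unsigned long long, matching the byte-granular AES state these instructions process. A hedged usage sketch (assumes a POWER8+ target with VSX/crypto enabled; the function is illustrative, not part of the header):

    /* Build for a crypto-capable POWER target, e.g.
     *   clang --target=powerpc64le-linux-gnu -mcpu=power8 -c sbox.c */
    #include <altivec.h>

    vector unsigned char aes_sub_bytes(vector unsigned char state) {
      /* With the LLVM 16 header the natural byte-vector type is accepted
       * directly; the old signature forced casts at call sites. */
      return __builtin_crypto_vsbox(state);
    }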
lib/include/amxfp16intrin.h (vendored, new file): 58 lines
@@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b)                                              \
  __builtin_ia32_tdpfp16ps(dst, a, b)

#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
lib/include/amxintrin.h (vendored): 32 lines changed
@@ -22,6 +22,8 @@
    __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+#define __DEFAULT_FN_ATTRS_FP16 \
+    __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))

/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}

+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
/// This struct pack the shape and tile data together for user. We suggest
/// initializing the struct as early as possible, because compiler depends
/// on the shape information to do configure. The constant value is preferred
@@ -484,9 +493,32 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                     src0.tile, src1.tile);
}

+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_FP16
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+                                       __tile1024i src1) {
+  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
+#undef __DEFAULT_FN_ATTRS_FP16

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
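Taken together with amxfp16intrin.h above, this wires the amx-fp16 tile dot-product into the existing __tile1024i helper API. A rough sketch of where __tile_dpfp16ps sits in the usual load/compute/store flow (shapes are illustrative; real code must first request AMX state from the OS and program a tile palette with _tile_loadconfig):

    #include <immintrin.h>
    #include <stddef.h>

    /* Illustrative only: assumes an AMX-FP16 machine with tiles already
     * configured for 16 rows x 64 bytes. */
    void dot_fp16_tiles(const void *a, const void *b, void *c,
                        size_t sa, size_t sb, size_t sc) {
      __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
      __tile_loadd(&ta, a, sa);
      __tile_loadd(&tb, b, sb);
      __tile_zero(&tc);
      __tile_dpfp16ps(&tc, ta, tb); /* FP16 pairs -> FP32 accumulate */
      __tile_stored(c, sc, tc);
    }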
lib/include/arm_acle.h (vendored): 151 lines changed
@@ -64,7 +64,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
}
#endif

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

@@ -82,7 +82,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
@@ -93,7 +93,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
@@ -140,17 +140,17 @@ __rorl(unsigned long __x, uint32_t __y) {
/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
-  return __builtin_clz(__t);
+  return (uint32_t)__builtin_clz(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
-  return __builtin_clzl(__t);
+  return (unsigned long)__builtin_clzl(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
-  return __builtin_clzll(__t);
+  return (uint64_t)__builtin_clzll(__t);
}

/* CLS */
@@ -201,7 +201,7 @@ __rev16(uint32_t __t) {

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
-  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
@@ -216,7 +216,7 @@ __rev16l(unsigned long __t) {
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
-  return __builtin_bswap16(__t);
+  return (int16_t)__builtin_bswap16((uint16_t)__t);
}

/* RBIT */
@@ -227,7 +227,7 @@ __rbit(uint32_t __t) {

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
@@ -247,7 +247,7 @@ __rbitl(unsigned long __t) {
/*
 * 9.3 16-bit multiplications
 */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
@@ -277,17 +277,17 @@ __smulwt(int32_t __a, int32_t __b) {
/*
 * 9.4 Saturating intrinsics
 *
- * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_FEATURE_SAT
+#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
@@ -305,7 +305,7 @@ __qdbl(int32_t __t) {
#endif

/* 9.4.3 Accumultating multiplications */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
@@ -334,13 +334,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {


/* 9.5.4 Parallel 16-bit saturation */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 9.5.5 Packing and unpacking */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
@@ -365,7 +365,7 @@ __uxtb16(int8x4_t __a) {
#endif

/* 9.5.6 Parallel selection */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
@@ -373,7 +373,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
#endif

/* 9.5.7 Parallel 8-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
@@ -425,7 +425,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
#endif

/* 9.5.8 Sum of 8-bit absolute differences */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
@@ -437,7 +437,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
#endif

/* 9.5.9 Parallel 16-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
@@ -537,7 +537,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
#endif

/* 9.5.10 Parallel 16-bit multiplications */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
@@ -589,155 +589,156 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
#endif

/* 9.7 CRC32 intrinsics */
-#if __ARM_FEATURE_CRC32
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
+    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif

/* Armv8.3-A Javascript conversion intrinsic */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32zf(float __a) {
-  return __builtin_arm_frint32zf(__a);
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32zf(float __a) {
+  return __builtin_arm_rint32zf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32z(double __a) {
-  return __builtin_arm_frint32z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32z(double __a) {
+  return __builtin_arm_rint32z(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64zf(float __a) {
-  return __builtin_arm_frint64zf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64zf(float __a) {
+  return __builtin_arm_rint64zf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64z(double __a) {
-  return __builtin_arm_frint64z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64z(double __a) {
+  return __builtin_arm_rint64z(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32xf(float __a) {
-  return __builtin_arm_frint32xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32xf(float __a) {
+  return __builtin_arm_rint32xf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32x(double __a) {
-  return __builtin_arm_frint32x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32x(double __a) {
+  return __builtin_arm_rint32x(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64xf(float __a) {
-  return __builtin_arm_frint64xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64xf(float __a) {
+  return __builtin_arm_rint64xf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64x(double __a) {
-  return __builtin_arm_frint64x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64x(double __a) {
+  return __builtin_arm_rint64x(__a);
}
#endif

/* Armv8.7-A load/store 64-byte intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

-static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif

/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

/* Memory Tagging Extensions (MTE) Intrinsics */
-#if __ARM_FEATURE_MEMORY_TAGGING
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
-#endif

/* Memory Operations Intrinsics */
-#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif

/* Transactional Memory Extension (TME) Intrinsics */
-#if __ARM_FEATURE_TME
+#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
@@ -759,12 +760,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */

/* Armv8.5-A Random number generation intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
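Most of the arm_acle.h churn is one mechanical pattern: `#if FOO` becomes `#if defined(FOO) && FOO`. Both treat an undefined FOO as false, but the old form evaluates the undefined identifier and so trips -Wundef in user builds that include the header; with short-circuit evaluation, the new form never does. The other recurring change, adding per-function attributes like target("crc") and target("v8.5a"), lets the intrinsics be declared unconditionally on AArch64 instead of hiding behind feature macros. A condensed illustration of the guard difference (the macro name is hypothetical):

    /* Compile with: clang -Wundef -c guard.c */

    #if __SOME_UNDEFINED_FEATURE /* warns under -Wundef */
    int old_style_guard;
    #endif

    #if defined(__SOME_UNDEFINED_FEATURE) && __SOME_UNDEFINED_FEATURE
    int new_style_guard; /* silent: the right operand is not evaluated
                            when defined() is false */
    #endif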
lib/include/arm_fp16.h (vendored): 2 lines changed
@@ -29,7 +29,7 @@
typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

-#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)
+#if defined(__aarch64__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
35180
lib/include/arm_neon.h
vendored
35180
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
Load Diff
lib/include/arm_neon_sve_bridge.h (vendored, new file): 182 lines
@@ -0,0 +1,182 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H

#include <arm_neon.h>
#include <arm_sve.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio                                                                  \
  static __inline__                                                            \
      __attribute__((__always_inline__, __nodebug__, __overloadable__))

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);

#undef __ai
#undef __aio

#ifdef __cplusplus
} // extern "C"
#endif

#endif //__ARM_NEON_SVE_BRIDGE_H
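A small usage sketch for the bridge header: svset_neonq writes a 128-bit NEON vector into the low bits of an SVE vector, svget_neonq reads it back, and svdup_neonq broadcasts it across the whole SVE register. Assumes an AArch64 compiler with SVE enabled (e.g. -march=armv8-a+sve); the function itself is illustrative:

    #include <arm_neon_sve_bridge.h>

    /* Add a NEON int32x4_t into the low 128 bits of an SVE accumulator
     * and return the updated low quadword as NEON again. */
    int32x4_t accumulate_low(svint32_t acc, int32x4_t v) {
      int32x4_t lo = svget_neonq_s32(acc);   /* SVE -> NEON, low 128 bits */
      svint32_t upd = svset_neonq_s32(acc, vaddq_s32(lo, v));
      return svget_neonq(upd);               /* overloaded form */
    }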
lib/include/arm_sve.h (vendored): 2040 lines changed. File diff suppressed because it is too large.
lib/include/avx512bf16intrin.h (vendored): 33 lines changed
@@ -10,12 +10,14 @@
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

+#ifdef __SSE2__
+
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H

-typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-typedef unsigned short __bfloat16;
+typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
@@ -33,7 +35,7 @@ typedef unsigned short __bfloat16;
///    A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
///    and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
  return __builtin_ia32_cvtsbf162ss_32(__A);
}

@@ -74,9 +76,9 @@ _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
-                                              (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
-                                              (__v32hi)__W);
+  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+                                                (__v32bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -96,9 +98,9 @@ _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
-                                              (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
-                                              (__v32hi)_mm512_setzero_si512());
+  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+                                                (__v32bf)_mm512_setzero_si512());
}

/// Convert Packed Single Data to Packed BF16 Data.
@@ -113,7 +115,7 @@ _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                              (__v16hi)_mm256_undefined_si256(),
+                                              (__v16bf)_mm256_undefined_si256(),
                                              (__mmask16)-1);
}

@@ -134,7 +136,7 @@ _mm512_cvtneps_pbh(__m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                        (__v16hi)__W,
+                                                        (__v16bf)__W,
                                                        (__mmask16)__U);
}

@@ -153,7 +155,7 @@ _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                (__v16hi)_mm256_setzero_si256(),
+                                                (__v16bf)_mm256_setzero_si256(),
                                                (__mmask16)__U);
}

@@ -174,8 +176,8 @@ _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
-                                             (__v16si) __A,
-                                             (__v16si) __B);
+                                             (__v32bf) __A,
+                                             (__v32bf) __B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -277,3 +279,4 @@ _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
#undef __DEFAULT_FN_ATTRS512

#endif
+#endif
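The storage type for bf16 elements moves from a bare unsigned short to the real __bf16 arithmetic type, and __bfloat16 survives only as a deprecated alias, so old code keeps compiling with a warning. A short sketch (requires -mavx512bf16):

    #include <immintrin.h>

    /* Widen one bf16 scalar to float. With the LLVM 16 header the
     * parameter is __bf16, so a raw unsigned short bit pattern no longer
     * converts implicitly; spell the type directly (or keep the
     * deprecated __bfloat16 alias during migration). */
    float widen(__bf16 b) { return _mm_cvtsbh_ss(b); }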
lib/include/avx512fintrin.h (vendored): 4 lines changed
@@ -256,8 +256,8 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
-  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

#define _mm512_setzero _mm512_setzero_ps
lib/include/avx512fp16intrin.h (vendored): 15 lines changed
@@ -10,6 +10,8 @@
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

+#ifdef __SSE2__
+
#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

@@ -17,12 +19,6 @@
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
@@ -829,7 +825,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
-  _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
+  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

@@ -838,13 +834,13 @@ _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

-  return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
+  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
-      (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
+      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
@@ -3347,3 +3343,4 @@ _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
#undef __DEFAULT_FN_ATTRS512

#endif
+#endif
lib/include/avx512ifmavlintrin.h (vendored): 40 lines changed
@@ -18,14 +18,21 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))

+#define _mm_madd52hi_epu64(X, Y, Z)                                            \
+  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
+                                          (__v2di)(Z)))
+
+#define _mm256_madd52hi_epu64(X, Y, Z)                                         \
+  ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y),            \
+                                          (__v4di)(Z)))
+
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
-                                                (__v2di) __Z);
-}
+#define _mm_madd52lo_epu64(X, Y, Z)                                            \
+  ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y),            \
+                                          (__v2di)(Z)))
+
+#define _mm256_madd52lo_epu64(X, Y, Z)                                         \
+  ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y),            \
+                                          (__v4di)(Z)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
@@ -43,13 +50,6 @@ _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
                                                      (__v2di)_mm_setzero_si128());
}

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
@@ -66,13 +66,6 @@ _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z
                                                         (__v4di)_mm256_setzero_si256());
}

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
-                                                (__v2di)__Z);
-}
-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
@@ -89,13 +82,6 @@ _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
                                                      (__v2di)_mm_setzero_si128());
}

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
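The unmasked madd52 entry points become macros over the underlying builtins while the masked forms stay inline functions; call sites do not change. A usage sketch (requires -mavx512ifma -mavx512vl):

    #include <immintrin.h>

    /* 52-bit multiply-accumulate: add the low and high 52-bit halves of
     * x*y into acc. Works the same whether the names are macros (LLVM 16)
     * or functions (older headers). */
    __m128i fma52(__m128i acc, __m128i x, __m128i y) {
      acc = _mm_madd52lo_epu64(acc, x, y);
      return _mm_madd52hi_epu64(acc, x, y);
    }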
lib/include/avx512vlbf16intrin.h (vendored): 69 lines changed
@ -10,11 +10,11 @@
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H

typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
@ -59,9 +59,9 @@ _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)__W);
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                                (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                                (__v8bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -81,9 +81,9 @@ _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)_mm_setzero_si128());
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                                (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                                (__v8bf)_mm_setzero_si128());
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -123,9 +123,9 @@ _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
                                              (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                              (__v16hi)__W);
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                                (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                                (__v16bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -145,9 +145,9 @@ _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
                                              (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                              (__v16hi)_mm256_setzero_si256());
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                                (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                                (__v16bf)_mm256_setzero_si256());
}

/// Convert Packed Single Data to Packed BF16 Data.
@ -160,12 +160,8 @@ _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_pbh(__m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)_mm_undefined_si128(),
                                                        (__mmask8)-1);
}
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
@ -185,7 +181,7 @@ _mm_cvtneps_pbh(__m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)__W,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

@ -205,7 +201,7 @@ _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)_mm_setzero_si128(),
                                                        (__v8bf)_mm_setzero_si128(),
                                                        (__mmask8)__U);
}

@ -218,12 +214,8 @@ _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_pbh(__m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)_mm_undefined_si128(),
                                                        (__mmask8)-1);
}
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
@ -242,7 +234,7 @@ _mm256_cvtneps_pbh(__m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)__W,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

@ -261,7 +253,7 @@ _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)_mm_setzero_si128(),
                                                        (__v8bf)_mm_setzero_si128(),
                                                        (__mmask8)__U);
}

@ -282,8 +274,8 @@ _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
                                             (__v4si)__A,
                                             (__v4si)__B);
                                             (__v8bf)__A,
                                             (__v8bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@ -351,8 +343,8 @@ _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
                                             (__v8si)__A,
                                             (__v8si)__B);
                                             (__v16bf)__A,
                                             (__v16bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@ -413,11 +405,11 @@ _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
///    A float data.
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
///    and fraction field is truncated to 7 bits.
static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
  __v4sf __V = {__A, 0, 0, 0};
  __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
      (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
  return (__bfloat16)__R[0];
  __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
      (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
  return (__bf16)__R[0];
}

/// Convert Packed BF16 Data to Packed float Data.
@ -520,3 +512,4 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS256

#endif
#endif
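Side note on the hunks above (commentary, not part of the vendored header): the bf16 intrinsics move from generic __v8hi/__v16hi integer vectors to dedicated __v8bf/__v16bf bfloat vectors, mask selection switches to the new __builtin_ia32_selectpbf_* builtins, and _mm_cvtneps_pbh/_mm256_cvtneps_pbh become macros over the unmasked vcvtneps2bf16 builtins. A minimal round-trip sketch of the public API, assuming a compiler and flags that enable it (e.g. clang with -mavx512vl -mavx512bf16):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Four floats narrow to bf16; the upper four bf16 lanes are zeroed. */
  __m128 f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128bh b = _mm_cvtneps_pbh(f);

  /* Widen the low four bf16 lanes back to float for inspection. */
  __m128 r = _mm_cvtpbh_ps(b);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}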
352
lib/include/avx512vlbwintrin.h
vendored
@ -2803,6 +2803,358 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
                                      (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
                                      (__v16hi)_mm256_setzero_si256()))

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi16(__m128i __W) {
  return __builtin_reduce_add((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi16(__m128i __W) {
  return __builtin_reduce_mul((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi16(__m128i __W) {
  return __builtin_reduce_and((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi16(__m128i __W) {
  return __builtin_reduce_or((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_add((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
  return __builtin_reduce_mul((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
  return __builtin_reduce_and((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_or((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi16(__m128i __V) {
  return __builtin_reduce_max((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu16(__m128i __V) {
  return __builtin_reduce_max((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi16(__m128i __V) {
  return __builtin_reduce_min((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu16(__m128i __V) {
  return __builtin_reduce_min((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
  return __builtin_reduce_max((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
  __V = _mm_maskz_mov_epi16(__M, __V);
  return __builtin_reduce_max((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
  return __builtin_reduce_min((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
  return __builtin_reduce_min((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi16(__m256i __W) {
  return __builtin_reduce_add((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi16(__m256i __W) {
  return __builtin_reduce_mul((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi16(__m256i __W) {
  return __builtin_reduce_and((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi16(__m256i __W) {
  return __builtin_reduce_or((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_add((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
  return __builtin_reduce_mul((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
  return __builtin_reduce_and((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_or((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi16(__m256i __V) {
  return __builtin_reduce_max((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu16(__m256i __V) {
  return __builtin_reduce_max((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi16(__m256i __V) {
  return __builtin_reduce_min((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu16(__m256i __V) {
  return __builtin_reduce_min((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
  return __builtin_reduce_max((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
  __V = _mm256_maskz_mov_epi16(__M, __V);
  return __builtin_reduce_max((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
  return __builtin_reduce_min((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
  return __builtin_reduce_min((__v16hu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi8(__m128i __W) {
  return __builtin_reduce_add((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi8(__m128i __W) {
  return __builtin_reduce_mul((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi8(__m128i __W) {
  return __builtin_reduce_and((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi8(__m128i __W) {
  return __builtin_reduce_or((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_add((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
  return __builtin_reduce_mul((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
  return __builtin_reduce_and((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_or((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi8(__m128i __V) {
  return __builtin_reduce_max((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu8(__m128i __V) {
  return __builtin_reduce_max((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi8(__m128i __V) {
  return __builtin_reduce_min((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu8(__m128i __V) {
  return __builtin_reduce_min((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
  return __builtin_reduce_max((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
  __V = _mm_maskz_mov_epi8(__M, __V);
  return __builtin_reduce_max((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
  return __builtin_reduce_min((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
  return __builtin_reduce_min((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi8(__m256i __W) {
  return __builtin_reduce_add((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi8(__m256i __W) {
  return __builtin_reduce_mul((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi8(__m256i __W) {
  return __builtin_reduce_and((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi8(__m256i __W) {
  return __builtin_reduce_or((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_add((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
  return __builtin_reduce_mul((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
  return __builtin_reduce_and((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_or((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi8(__m256i __V) {
  return __builtin_reduce_max((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu8(__m256i __V) {
  return __builtin_reduce_max((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi8(__m256i __V) {
  return __builtin_reduce_min((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu8(__m256i __V) {
  return __builtin_reduce_min((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
  return __builtin_reduce_max((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
  __V = _mm256_maskz_mov_epi8(__M, __V);
  return __builtin_reduce_max((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
  return __builtin_reduce_min((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
  return __builtin_reduce_min((__v32qu)__V);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
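The masked reductions added above all follow one pattern: masked-off lanes are first overwritten with the identity element of the operation (0 for add/or, 1 for mul, all-ones for and, the extreme signed/unsigned value for max/min), after which a plain __builtin_reduce_* over every lane yields the masked result. A small sketch of the observable behavior, with hypothetical values, assuming clang with -mavx512vl -mavx512bw:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Lanes 0..7 hold 1..8 (_mm_set_epi16 lists lane 7 first). */
  __m128i v = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);

  /* Only lanes 0 and 1 participate: 1 + 2 == 3. */
  short s = _mm_mask_reduce_add_epi16((__mmask8)0x03, v);

  /* Masked mul relies on the identity 1: lanes 2 and 3 give 3 * 4 == 12. */
  short p = _mm_mask_reduce_mul_epi16((__mmask8)0x0C, v);

  printf("%d %d\n", s, p); /* expected: 3 12 */
  return 0;
}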
3
lib/include/avx512vlfp16intrin.h
vendored
@ -11,6 +11,8 @@
    "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H

@ -2066,3 +2068,4 @@ _mm_reduce_min_ph(__m128h __V) {
#undef __DEFAULT_FN_ATTRS256

#endif
#endif
177
lib/include/avxifmaintrin.h
vendored
Normal file
@ -0,0 +1,177 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(256)))

// These intrinsics must use VEX encoding.

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
///    return __m128i dst.
/// \param __X
///    A 128-bit vector of [2 x i64]
/// \param __Y
///    A 128-bit vector of [2 x i64]
/// \param __Z
///    A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
///    return __m256i dst.
/// \param __X
///    A 256-bit vector of [4 x i64]
/// \param __Y
///    A 256-bit vector of [4 x i64]
/// \param __Z
///    A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
///    return __m128i dst.
/// \param __X
///    A 128-bit vector of [2 x i64]
/// \param __Y
///    A 128-bit vector of [2 x i64]
/// \param __Z
///    A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
///    return __m256i dst.
/// \param __X
///    A 256-bit vector of [4 x i64]
/// \param __Y
///    A 256-bit vector of [4 x i64]
/// \param __Z
///    A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXIFMAINTRIN_H
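The madd52 intrinsics are aimed at multi-precision arithmetic on 52-bit limbs: the 104-bit product of two limbs is split into its low and high 52-bit halves, and each half is accumulated into a separate 64-bit lane. A sketch with arbitrary values, assuming a compiler that accepts -mavxifma (clang 16 or newer):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Two 52-bit limbs per lane and a zeroed accumulator. */
  __m128i a   = _mm_set1_epi64x((1LL << 51) + 5);
  __m128i b   = _mm_set1_epi64x((1LL << 50) + 7);
  __m128i acc = _mm_setzero_si128();

  /* Accumulate the low and high 52-bit halves of a*b separately. */
  __m128i lo = _mm_madd52lo_avx_epu64(acc, a, b);
  __m128i hi = _mm_madd52hi_avx_epu64(acc, a, b);

  printf("lo=%lld hi=%lld\n", (long long)_mm_cvtsi128_si64(lo),
         (long long)_mm_cvtsi128_si64(hi));
  return 0;
}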
14
lib/include/avxintrin.h
vendored
@ -39,6 +39,16 @@ typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
@ -4288,7 +4298,7 @@ _mm256_set1_epi64x(long long __q)
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
  return __extension__ (__m256d){ 0, 0, 0, 0 };
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

/// Constructs a 256-bit floating-point vector of [8 x float] with all
@ -4302,7 +4312,7 @@ _mm256_setzero_pd(void)
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

/// Constructs a 256-bit integer vector initialized to zero.
484
lib/include/avxneconvertintrin.h
vendored
Normal file
@ -0,0 +1,484 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(256)))

/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}

/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}

/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}

/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__
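The even/odd converters above pair naturally: a buffer of interleaved 16-bit bf16 or fp16 values widens into two float vectors, one per parity, without any shuffling. A sketch with made-up data, assuming clang with -mavxneconvert:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Eight bf16 lanes: 1..4 in the low half, zeros in the high half. */
  __m128bh src = _mm_cvtneps_avx_pbh(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));

  /* Even lanes (0,2,4,6) and odd lanes (1,3,5,7) as floats. */
  __m128 even = _mm_cvtneebf16_ps(&src);
  __m128 odd  = _mm_cvtneobf16_ps(&src);

  float e[4], o[4];
  _mm_storeu_ps(e, even);
  _mm_storeu_ps(o, odd);
  printf("%g %g\n", e[0], o[0]); /* expected: 1 2 */
  return 0;
}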
471
lib/include/avxvnniint8intrin.h
vendored
Normal file
@ -0,0 +1,471 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
                 __min_vector_width__(128)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \returns
|
||||
/// A 128-bit vector of [4 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \returns
|
||||
/// A 256-bit vector of [8 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W with signed saturation, and store the packed
|
||||
/// 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \returns
|
||||
/// A 128-bit vector of [4 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W with signed saturation, and store the packed
|
||||
/// 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \returns
|
||||
/// A 256-bit vector of [8 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
|
||||
#endif // __AVXVNNIINT8INTRIN_H
|
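As a quick orientation for readers of this header, here is a minimal usage sketch for the dot-product family above. It is not part of the upstream diff; it assumes a toolchain built with -mavxvnniint8 and a CPU that reports AVX-VNNI-INT8, and the input values are purely illustrative.

// Accumulate per-lane dot products of unsigned bytes with _mm_dpbuud_epi32.
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char a[16], b[16];
  for (int i = 0; i < 16; ++i) {
    a[i] = (unsigned char)i; // sample data
    b[i] = 2;
  }

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);

  // Lane j of the result is the sum over k of a[4j+k] * b[4j+k], added to 0.
  __m128i acc = _mm_dpbuud_epi32(_mm_setzero_si128(), va, vb);

  int out[4];
  _mm_storeu_si128((__m128i *)out, acc);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 12 44 76 108
  return 0;
}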
70
lib/include/cmpccxaddintrin.h
vendored
Normal file
@ -0,0 +1,70 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __X86GPRINTRIN_H
#error \
    "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__

typedef enum {
  _CMPCCX_O,   /* Overflow. */
  _CMPCCX_NO,  /* No overflow. */
  _CMPCCX_B,   /* Below. */
  _CMPCCX_NB,  /* Not below. */
  _CMPCCX_Z,   /* Zero. */
  _CMPCCX_NZ,  /* Not zero. */
  _CMPCCX_BE,  /* Below or equal. */
  _CMPCCX_NBE, /* Neither below nor equal. */
  _CMPCCX_S,   /* Sign. */
  _CMPCCX_NS,  /* No sign. */
  _CMPCCX_P,   /* Parity. */
  _CMPCCX_NP,  /* No parity. */
  _CMPCCX_L,   /* Less. */
  _CMPCCX_NL,  /* Not less. */
  _CMPCCX_LE,  /* Less or equal. */
  _CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;

/// Compares the value at the memory location __A with the value of __B. If
/// the specified condition __D is met, adds the third operand __C to the
/// value at __A and writes the result back to __A; otherwise the value at
/// __A is left unchanged. The return value is the original value at __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
///    A pointer specifying the memory address.
///
/// \param __B
///    An integer operand.
///
/// \param __C
///    An integer operand.
///
/// \param __D
///    The specified condition.
///
/// \returns An integer which is the original value of the first operand.

#define _cmpccxadd_epi32(__A, __B, __C, __D)                                   \
  ((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C),     \
                                    (int)(__D))))

#define _cmpccxadd_epi64(__A, __B, __C, __D)                                   \
  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B),     \
                                          (long long)(__C), (int)(__D))))

#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H
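A hedged usage sketch for the macros above (not from the header itself; it assumes clang -mcmpccxadd on an x86-64 CPU with CMPCCXADD): bump a counter only while it is still below a limit.

#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  int counter = 5;
  // CMPCCXADD compares *(&counter) with 10; since 5 < 10 (_CMPCCX_L holds),
  // it adds 1 to counter. Either way it returns the original value.
  int old = _cmpccxadd_epi32(&counter, 10, 1, _CMPCCX_L);
  printf("old=%d new=%d\n", old, counter); // old=5 new=6
  return 0;
}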
12
lib/include/cpuid.h
vendored
@ -200,9 +200,18 @@
#define bit_AMXINT8 0x02000000

/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_RAOINT 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_CMPCCXADD 0x00000080
#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
#define bit_AVXIFMA 0x00800000

/* Features in %edx for leaf 7 sub-leaf 1 */
#define bit_AVXVNNIINT8 0x00000010
#define bit_AVXNECONVERT 0x00000020
#define bit_PREFETCHI 0x00004000

/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
@ -261,7 +270,8 @@
    : "0"(__leaf), "2"(__count))
#endif

static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
                                              unsigned int *__sig)
{
  unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
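The new sub-leaf bits above pair with the query helpers in the same header. A sketch (not part of the diff) of probing leaf 7 sub-leaf 1 for the AVX-VNNI-INT8 bit:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  // __get_cpuid_count returns 0 when leaf 7 is unsupported, so the bit
  // test below is safe on older CPUs.
  if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) &&
      (edx & bit_AVXVNNIINT8))
    puts("AVX-VNNI-INT8 available");
  return 0;
}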
90
lib/include/cuda_wrappers/cmath
vendored
Normal file
@ -0,0 +1,90 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH

#include_next <cmath>

#if defined(_LIBCPP_STD_VER)

// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.

__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);

namespace std {

// For __constexpr_fmin/fmax we only need device-side overloads before C++14,
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
  return __builtin_fmaxf(__x, __y);
}

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
  return __builtin_fmax(__x, __y);
}

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
  return __builtin_fmaxl(__x, __y);
}

template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
  using __result_type = typename __promote<_Tp, _Up>::type;
  return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14

// For the logb/scalbn templates we must always provide device overloads,
// because the libc++ implementation uses __builtin_XXX, which gets translated
// into a libcall that we can't handle on the GPU. We need to forward those to
// the CUDA-provided implementations.

template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
  return ::logb(__x);
}

template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
  return ::scalbn(__x, __exp);
}

} // namespace std

#endif // _LIBCPP_STD_VER

#endif // include guard
12
lib/include/emmintrin.h
vendored
@ -38,6 +38,16 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
@ -1809,7 +1819,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
///    all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
  return __extension__(__m128d){0, 0};
  return __extension__(__m128d){0.0, 0.0};
}

/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
27
lib/include/float.h
vendored
@ -38,9 +38,10 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
     __cplusplus >= 201103L || \
     (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@ -67,9 +68,10 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
     __cplusplus >= 201703L || \
     (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201703L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@ -84,7 +86,10 @@

/* Characteristics of floating point types, C99 5.2.4.2.2 */

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
#endif
#define FLT_ROUNDS (__builtin_flt_rounds())
#define FLT_RADIX __FLT_RADIX__

@ -92,8 +97,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__

#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
    __cplusplus >= 201103L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@ -130,8 +136,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__

#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
    __cplusplus >= 201703L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201703L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
12
lib/include/gfniintrin.h
vendored
@ -20,10 +20,12 @@
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))

/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))

/* Default attributes for VLX forms. */
/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))

@ -99,7 +101,7 @@ _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
                                              (__v64qi) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectb_512(__U,
@ -107,7 +109,7 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
                                              (__v64qi) __S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
15
lib/include/hlsl.h
vendored
@ -1,15 +0,0 @@
//===----- hlsl.h - HLSL definitions --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_H_
#define _HLSL_H_

#include "hlsl/hlsl_basic_types.h"
#include "hlsl/hlsl_intrinsics.h"

#endif //_HLSL_H_
64
lib/include/hlsl_basic_types.h
vendored
@ -1,64 +0,0 @@
//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_HLSL_BASIC_TYPES_H_
#define _HLSL_HLSL_BASIC_TYPES_H_

// built-in scalar data types:

#ifdef __HLSL_ENABLE_16_BIT
// 16-bit integer.
typedef unsigned short uint16_t;
typedef short int16_t;
#endif

// unsigned 32-bit integer.
typedef unsigned int uint;

// 64-bit integer.
typedef unsigned long uint64_t;
typedef long int64_t;

// built-in vector data types:

#ifdef __HLSL_ENABLE_16_BIT
typedef vector<int16_t, 2> int16_t2;
typedef vector<int16_t, 3> int16_t3;
typedef vector<int16_t, 4> int16_t4;
typedef vector<uint16_t, 2> uint16_t2;
typedef vector<uint16_t, 3> uint16_t3;
typedef vector<uint16_t, 4> uint16_t4;
#endif

typedef vector<int, 2> int2;
typedef vector<int, 3> int3;
typedef vector<int, 4> int4;
typedef vector<uint, 2> uint2;
typedef vector<uint, 3> uint3;
typedef vector<uint, 4> uint4;
typedef vector<int64_t, 2> int64_t2;
typedef vector<int64_t, 3> int64_t3;
typedef vector<int64_t, 4> int64_t4;
typedef vector<uint64_t, 2> uint64_t2;
typedef vector<uint64_t, 3> uint64_t3;
typedef vector<uint64_t, 4> uint64_t4;

#ifdef __HLSL_ENABLE_16_BIT
typedef vector<half, 2> half2;
typedef vector<half, 3> half3;
typedef vector<half, 4> half4;
#endif

typedef vector<float, 2> float2;
typedef vector<float, 3> float3;
typedef vector<float, 4> float4;
typedef vector<double, 2> double2;
typedef vector<double, 3> double3;
typedef vector<double, 4> double4;

#endif //_HLSL_HLSL_BASIC_TYPES_H_
15
lib/include/hlsl_intrinsics.h
vendored
@ -1,15 +0,0 @@
//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_HLSL_INTRINSICS_H_
#define _HLSL_HLSL_INTRINSICS_H_

__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
WaveActiveCountBits(bool bBit);

#endif //_HLSL_HLSL_INTRINSICS_H_
48
lib/include/immintrin.h
vendored
@ -189,6 +189,11 @@
#include <avx512ifmavlintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXIFMA__)
#include <avxifmaintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
@ -214,17 +219,13 @@
#include <avx512pfintrin.h>
#endif

/*
 * FIXME: The _Float16 type is legal only when the HW supports float16
 * operations. We use __AVX512FP16__ to identify whether float16 is supported,
 * so when float16 is not supported, the related header is not included.
 */
#if defined(__AVX512FP16__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif

#if defined(__AVX512FP16__) && defined(__AVX512VL__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512FP16__))
#include <avx512vlfp16intrin.h>
#endif

@ -258,6 +259,16 @@
#include <gfniintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXVNNIINT8__)
#include <avxvnniint8intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXNECONVERT__)
#include <avxneconvertintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
@ -291,6 +302,23 @@ _rdrand64_step(unsigned long long *__p)
{
  return (int)__builtin_ia32_rdrand64_step(__p);
}
#else
// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
// rdrand instructions.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
  unsigned int __lo, __hi;
  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
  if (__res_lo && __res_hi) {
    *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
    return 1;
  } else {
    *__p = 0;
    return 0;
  }
}
#endif
#endif /* __RDRND__ */

@ -495,6 +523,10 @@ _storebe_i64(void * __P, long long __D) {
    defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AMXFP16__)
#include <amxfp16intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__KL__) || defined(__WIDEKL__)
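Both the native and the newly added emulated 64-bit path above return 0 on failure, so callers should retry. A short sketch (not from the header; assumes clang -mrdrnd):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long r = 0;
  // RDRAND can transiently fail; bound the retries instead of looping forever.
  for (int tries = 0; tries < 10; ++tries) {
    if (_rdrand64_step(&r)) {
      printf("random: %llu\n", r);
      return 0;
    }
  }
  fprintf(stderr, "hardware RNG unavailable\n");
  return 1;
}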
234
lib/include/larchintrin.h
vendored
Normal file
@ -0,0 +1,234 @@
/*===------------ larchintrin.h - LoongArch intrinsics ---------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _LOONGARCH_BASE_INTRIN_H
#define _LOONGARCH_BASE_INTRIN_H

#ifdef __cplusplus
extern "C" {
#endif

typedef struct rdtime {
  unsigned int value;
  unsigned int timeid;
} __rdtime_t;

#if __loongarch_grlen == 64
typedef struct drdtime {
  unsigned long dvalue;
  unsigned long dtimeid;
} __drdtime_t;

extern __inline __drdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtime_d(void) {
  __drdtime_t __drdtime;
  __asm__ volatile(
      "rdtime.d %[val], %[tid]\n\t"
      : [val] "=&r"(__drdtime.dvalue), [tid] "=&r"(__drdtime.dtimeid));
  return __drdtime;
}
#endif

extern __inline __rdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtimeh_w(void) {
  __rdtime_t __rdtime;
  __asm__ volatile("rdtimeh.w %[val], %[tid]\n\t"
                   : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
  return __rdtime;
}

extern __inline __rdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtimel_w(void) {
  __rdtime_t __rdtime;
  __asm__ volatile("rdtimel.w %[val], %[tid]\n\t"
                   : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
  return __rdtime;
}

#if __loongarch_grlen == 64
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_b_w(char _1, int _2) {
  return (int)__builtin_loongarch_crc_w_b_w((char)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_h_w(short _1, int _2) {
  return (int)__builtin_loongarch_crc_w_h_w((short)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_w_w(int _1, int _2) {
  return (int)__builtin_loongarch_crc_w_w_w((int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_d_w(long int _1, int _2) {
  return (int)__builtin_loongarch_crc_w_d_w((long int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_b_w(char _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_b_w((char)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_h_w(short _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_h_w((short)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_w_w(int _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_w_w((int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_d_w(long int _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_d_w((long int)_1, (int)_2);
}
#endif

#define __break(/*ui15*/ _1) __builtin_loongarch_break((_1))

#if __loongarch_grlen == 32
#define __cacop_w(/*uimm5*/ _1, /*unsigned int*/ _2, /*simm12*/ _3) \
  ((void)__builtin_loongarch_cacop_w((_1), (unsigned int)(_2), (_3)))
#endif

#if __loongarch_grlen == 64
#define __cacop_d(/*uimm5*/ _1, /*unsigned long int*/ _2, /*simm12*/ _3) \
  ((void)__builtin_loongarch_cacop_d((_1), (unsigned long int)(_2), (_3)))
#endif

#define __dbar(/*ui15*/ _1) __builtin_loongarch_dbar((_1))

#define __ibar(/*ui15*/ _1) __builtin_loongarch_ibar((_1))

#define __movfcsr2gr(/*ui5*/ _1) __builtin_loongarch_movfcsr2gr((_1))

#define __movgr2fcsr(/*ui5*/ _1, _2) \
  __builtin_loongarch_movgr2fcsr((_1), (unsigned int)_2)

#define __syscall(/*ui15*/ _1) __builtin_loongarch_syscall((_1))

#define __csrrd_w(/*ui14*/ _1) ((unsigned int)__builtin_loongarch_csrrd_w((_1)))

#define __csrwr_w(/*unsigned int*/ _1, /*ui14*/ _2) \
  ((unsigned int)__builtin_loongarch_csrwr_w((unsigned int)(_1), (_2)))

#define __csrxchg_w(/*unsigned int*/ _1, /*unsigned int*/ _2, /*ui14*/ _3) \
  ((unsigned int)__builtin_loongarch_csrxchg_w((unsigned int)(_1), \
                                               (unsigned int)(_2), (_3)))

#if __loongarch_grlen == 64
#define __csrrd_d(/*ui14*/ _1) \
  ((unsigned long int)__builtin_loongarch_csrrd_d((_1)))

#define __csrwr_d(/*unsigned long int*/ _1, /*ui14*/ _2) \
  ((unsigned long int)__builtin_loongarch_csrwr_d((unsigned long int)(_1), \
                                                  (_2)))

#define __csrxchg_d(/*unsigned long int*/ _1, /*unsigned long int*/ _2, \
                    /*ui14*/ _3) \
  ((unsigned long int)__builtin_loongarch_csrxchg_d( \
      (unsigned long int)(_1), (unsigned long int)(_2), (_3)))
#endif

extern __inline unsigned char
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_b(unsigned int _1) {
  return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
}

extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_h(unsigned int _1) {
  return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_w(unsigned int _1) {
  return (unsigned int)__builtin_loongarch_iocsrrd_w((unsigned int)_1);
}

#if __loongarch_grlen == 64
extern __inline unsigned long int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_d(unsigned int _1) {
  return (unsigned long int)__builtin_loongarch_iocsrrd_d((unsigned int)_1);
}
#endif

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_b(unsigned char _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_b((unsigned char)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_h(unsigned short _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_h((unsigned short)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_w(unsigned int _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_w((unsigned int)_1, (unsigned int)_2);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __cpucfg(unsigned int _1) {
  return (unsigned int)__builtin_loongarch_cpucfg((unsigned int)_1);
}

#if __loongarch_grlen == 64
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_d(unsigned long int _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_d((unsigned long int)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __asrtgt_d(long int _1, long int _2) {
  __builtin_loongarch_asrtgt_d((long int)_1, (long int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __asrtle_d(long int _1, long int _2) {
  __builtin_loongarch_asrtle_d((long int)_1, (long int)_2);
}
#endif

#if __loongarch_grlen == 64
#define __lddir_d(/*long int*/ _1, /*ui5*/ _2) \
  ((long int)__builtin_loongarch_lddir_d((long int)(_1), (_2)))

#define __ldpte_d(/*long int*/ _1, /*ui5*/ _2) \
  ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
#endif

#ifdef __cplusplus
}
#endif
#endif /* _LOONGARCH_BASE_INTRIN_H */
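A hedged example of the timer and CRC helpers above (not part of the header; assumes a 64-bit LoongArch target, i.e. __loongarch_grlen == 64):

#include <larchintrin.h>
#include <stdio.h>

int main(void) {
  __drdtime_t t = __rdtime_d(); // stable counter value plus counter ID
  // Fold the low 32 bits of the counter into a CRC32 with seed 0.
  int crc = __crc_w_w_w((int)t.dvalue, 0);
  printf("time=%lu id=%lu crc=%08x\n", t.dvalue, t.dtimeid, (unsigned)crc);
  return 0;
}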
5
lib/include/limits.h
vendored
@ -65,7 +65,7 @@
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
@ -93,7 +93,8 @@
/* C99 5.2.4.2.1: Added long long.
   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
 */
#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)

#undef LLONG_MIN
#undef LLONG_MAX
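The guard rewrite repeated across float.h and limits.h in this commit has one motivation: in C++ (or pre-C99 C) __STDC_VERSION__ is not defined, and evaluating an undefined macro inside #if trips -Wundef. A minimal illustration with a hypothetical project-local flag:

/* Testing defined() first keeps -Wundef quiet when the macro is absent. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define MY_HAVE_C99 1 /* hypothetical flag, not from the headers */
#else
#define MY_HAVE_C99 0
#endif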
19
lib/include/opencl-c-base.h
vendored
@ -74,6 +74,25 @@
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)

// Undefine any feature macros that have been explicitly disabled using
// an __undef_<feature> macro.
#ifdef __undef___opencl_c_work_group_collective_functions
#undef __opencl_c_work_group_collective_functions
#endif
#ifdef __undef___opencl_c_atomic_order_seq_cst
#undef __opencl_c_atomic_order_seq_cst
#endif
#ifdef __undef___opencl_c_atomic_scope_device
#undef __opencl_c_atomic_scope_device
#endif
#ifdef __undef___opencl_c_atomic_scope_all_devices
#undef __opencl_c_atomic_scope_all_devices
#endif
#ifdef __undef___opencl_c_read_write_images
#undef __opencl_c_read_write_images
#endif

#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)

#if !defined(__opencl_c_generic_address_space)
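A sketch of how kernel code consumes one of these feature macros after the __undef_ pass above (OpenCL C, not part of the header; the kernel body is illustrative only):

// Compile this path only when read-write images survived the feature pass.
#if defined(__opencl_c_read_write_images)
kernel void dim_image(read_write image2d_t img) {
  int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));
  float4 px = read_imagef(img, pos);
  write_imagef(img, pos, px * 0.5f); // halve each pixel in place
}
#endif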
320
lib/include/opencl-c.h
vendored
@ -12396,11 +12396,11 @@ void __ovld vstorea_half16_rtn(double16, size_t, __private half *);
 * image objects and then want to read the updated data.
 */
void __ovld __conv barrier(cl_mem_fence_flags flags);
void __ovld __conv barrier(cl_mem_fence_flags);

#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv work_group_barrier(cl_mem_fence_flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@ -12418,7 +12418,7 @@ void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld mem_fence(cl_mem_fence_flags flags);
void __ovld mem_fence(cl_mem_fence_flags);

/**
 * Read memory barrier that orders only
@ -12430,7 +12430,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld read_mem_fence(cl_mem_fence_flags flags);
void __ovld read_mem_fence(cl_mem_fence_flags);

/**
 * Write memory barrier that orders only
@ -12442,7 +12442,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld write_mem_fence(cl_mem_fence_flags flags);
void __ovld write_mem_fence(cl_mem_fence_flags);

// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions

@ -12891,29 +12891,29 @@ void __ovld prefetch(const __global half16 *, size_t);
 * (old + val) and store result at location
 * pointed by p. The function returns old.
 */
int __ovld atomic_add(volatile __global int *p, int val);
uint __ovld atomic_add(volatile __global uint *p, uint val);
int __ovld atomic_add(volatile __local int *p, int val);
uint __ovld atomic_add(volatile __local uint *p, uint val);
int __ovld atomic_add(volatile __global int *, int);
uint __ovld atomic_add(volatile __global uint *, uint);
int __ovld atomic_add(volatile __local int *, int);
uint __ovld atomic_add(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_add(volatile int *p, int val);
uint __ovld atomic_add(volatile uint *p, uint val);
int __ovld atomic_add(volatile int *, int);
uint __ovld atomic_add(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_add(volatile __global int *p, int val);
uint __ovld atom_add(volatile __global uint *p, uint val);
int __ovld atom_add(volatile __global int *, int);
uint __ovld atom_add(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_add(volatile __local int *p, int val);
uint __ovld atom_add(volatile __local uint *p, uint val);
int __ovld atom_add(volatile __local int *, int);
uint __ovld atom_add(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_add(volatile __global long *p, long val);
ulong __ovld atom_add(volatile __global ulong *p, ulong val);
long __ovld atom_add(volatile __local long *p, long val);
ulong __ovld atom_add(volatile __local ulong *p, ulong val);
long __ovld atom_add(volatile __global long *, long);
ulong __ovld atom_add(volatile __global ulong *, ulong);
long __ovld atom_add(volatile __local long *, long);
ulong __ovld atom_add(volatile __local ulong *, ulong);
#endif

/**
@ -12921,29 +12921,29 @@ ulong __ovld atom_add(volatile __local ulong *p, ulong val);
 * Compute (old - val) and store result at location pointed by p. The function
 * returns old.
 */
int __ovld atomic_sub(volatile __global int *p, int val);
uint __ovld atomic_sub(volatile __global uint *p, uint val);
int __ovld atomic_sub(volatile __local int *p, int val);
uint __ovld atomic_sub(volatile __local uint *p, uint val);
int __ovld atomic_sub(volatile __global int *, int);
uint __ovld atomic_sub(volatile __global uint *, uint);
int __ovld atomic_sub(volatile __local int *, int);
uint __ovld atomic_sub(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_sub(volatile int *p, int val);
uint __ovld atomic_sub(volatile uint *p, uint val);
int __ovld atomic_sub(volatile int *, int);
uint __ovld atomic_sub(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_sub(volatile __global int *p, int val);
uint __ovld atom_sub(volatile __global uint *p, uint val);
int __ovld atom_sub(volatile __global int *, int);
uint __ovld atom_sub(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_sub(volatile __local int *p, int val);
uint __ovld atom_sub(volatile __local uint *p, uint val);
int __ovld atom_sub(volatile __local int *, int);
uint __ovld atom_sub(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_sub(volatile __global long *p, long val);
ulong __ovld atom_sub(volatile __global ulong *p, ulong val);
long __ovld atom_sub(volatile __local long *p, long val);
ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
long __ovld atom_sub(volatile __global long *, long);
ulong __ovld atom_sub(volatile __global ulong *, ulong);
long __ovld atom_sub(volatile __local long *, long);
ulong __ovld atom_sub(volatile __local ulong *, ulong);
#endif

/**
@ -12951,32 +12951,32 @@ ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
 * with new value given by val. Returns old
 * value.
 */
int __ovld atomic_xchg(volatile __global int *p, int val);
uint __ovld atomic_xchg(volatile __global uint *p, uint val);
int __ovld atomic_xchg(volatile __local int *p, int val);
uint __ovld atomic_xchg(volatile __local uint *p, uint val);
float __ovld atomic_xchg(volatile __global float *p, float val);
float __ovld atomic_xchg(volatile __local float *p, float val);
int __ovld atomic_xchg(volatile __global int *, int);
uint __ovld atomic_xchg(volatile __global uint *, uint);
int __ovld atomic_xchg(volatile __local int *, int);
uint __ovld atomic_xchg(volatile __local uint *, uint);
float __ovld atomic_xchg(volatile __global float *, float);
float __ovld atomic_xchg(volatile __local float *, float);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xchg(volatile int *p, int val);
uint __ovld atomic_xchg(volatile uint *p, uint val);
float __ovld atomic_xchg(volatile float *p, float val);
int __ovld atomic_xchg(volatile int *, int);
uint __ovld atomic_xchg(volatile uint *, uint);
float __ovld atomic_xchg(volatile float *, float);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
uint __ovld atom_xchg(volatile __global uint *p, uint val);
int __ovld atom_xchg(volatile __global int *, int);
uint __ovld atom_xchg(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_xchg(volatile __local int *p, int val);
uint __ovld atom_xchg(volatile __local uint *p, uint val);
int __ovld atom_xchg(volatile __local int *, int);
uint __ovld atom_xchg(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_xchg(volatile __global long *p, long val);
long __ovld atom_xchg(volatile __local long *p, long val);
ulong __ovld atom_xchg(volatile __global ulong *p, ulong val);
ulong __ovld atom_xchg(volatile __local ulong *p, ulong val);
long __ovld atom_xchg(volatile __global long *, long);
long __ovld atom_xchg(volatile __local long *, long);
ulong __ovld atom_xchg(volatile __global ulong *, ulong);
ulong __ovld atom_xchg(volatile __local ulong *, ulong);
#endif

/**
@ -13048,29 +13048,29 @@ ulong __ovld atom_dec(volatile __local ulong *);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __global int *, int, int);
uint __ovld atomic_cmpxchg(volatile __global uint *, uint, uint);
int __ovld atomic_cmpxchg(volatile __local int *, int, int);
uint __ovld atomic_cmpxchg(volatile __local uint *, uint, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile int *, int, int);
uint __ovld atomic_cmpxchg(volatile uint *, uint, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __global int *, int, int);
uint __ovld atom_cmpxchg(volatile __global uint *, uint, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __local int *, int, int);
uint __ovld atom_cmpxchg(volatile __local uint *, uint, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __global long *, long, long);
ulong __ovld atom_cmpxchg(volatile __global ulong *, ulong, ulong);
long __ovld atom_cmpxchg(volatile __local long *, long, long);
ulong __ovld atom_cmpxchg(volatile __local ulong *, ulong, ulong);
#endif

/**
@ -13080,29 +13080,29 @@ ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_min(volatile __global int *p, int val);
uint __ovld atomic_min(volatile __global uint *p, uint val);
int __ovld atomic_min(volatile __local int *p, int val);
uint __ovld atomic_min(volatile __local uint *p, uint val);
int __ovld atomic_min(volatile __global int *, int);
uint __ovld atomic_min(volatile __global uint *, uint);
int __ovld atomic_min(volatile __local int *, int);
uint __ovld atomic_min(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_min(volatile int *p, int val);
uint __ovld atomic_min(volatile uint *p, uint val);
int __ovld atomic_min(volatile int *, int);
uint __ovld atomic_min(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_min(volatile __global int *p, int val);
uint __ovld atom_min(volatile __global uint *p, uint val);
int __ovld atom_min(volatile __global int *, int);
uint __ovld atom_min(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_min(volatile __local int *p, int val);
uint __ovld atom_min(volatile __local uint *p, uint val);
int __ovld atom_min(volatile __local int *, int);
uint __ovld atom_min(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_min(volatile __global long *p, long val);
ulong __ovld atom_min(volatile __global ulong *p, ulong val);
long __ovld atom_min(volatile __local long *p, long val);
ulong __ovld atom_min(volatile __local ulong *p, ulong val);
long __ovld atom_min(volatile __global long *, long);
ulong __ovld atom_min(volatile __global ulong *, ulong);
long __ovld atom_min(volatile __local long *, long);
ulong __ovld atom_min(volatile __local ulong *, ulong);
#endif

/**
@ -13112,29 +13112,29 @@ ulong __ovld atom_min(volatile __local ulong *p, ulong val);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_max(volatile __global int *p, int val);
uint __ovld atomic_max(volatile __global uint *p, uint val);
int __ovld atomic_max(volatile __local int *p, int val);
uint __ovld atomic_max(volatile __local uint *p, uint val);
int __ovld atomic_max(volatile __global int *, int);
uint __ovld atomic_max(volatile __global uint *, uint);
int __ovld atomic_max(volatile __local int *, int);
uint __ovld atomic_max(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_max(volatile int *p, int val);
uint __ovld atomic_max(volatile uint *p, uint val);
int __ovld atomic_max(volatile int *, int);
uint __ovld atomic_max(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_max(volatile __global int *p, int val);
uint __ovld atom_max(volatile __global uint *p, uint val);
int __ovld atom_max(volatile __global int *, int);
uint __ovld atom_max(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_max(volatile __local int *p, int val);
uint __ovld atom_max(volatile __local uint *p, uint val);
int __ovld atom_max(volatile __local int *, int);
uint __ovld atom_max(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_max(volatile __global long *p, long val);
ulong __ovld atom_max(volatile __global ulong *p, ulong val);
long __ovld atom_max(volatile __local long *p, long val);
ulong __ovld atom_max(volatile __local ulong *p, ulong val);
long __ovld atom_max(volatile __global long *, long);
ulong __ovld atom_max(volatile __global ulong *, ulong);
long __ovld atom_max(volatile __local long *, long);
ulong __ovld atom_max(volatile __local ulong *, ulong);
#endif

/**
@ -13143,29 +13143,29 @@ ulong __ovld atom_max(volatile __local ulong *p, ulong val);
 * (old & val) and store result at location
 * pointed by p. The function returns old.
 */
int __ovld atomic_and(volatile __global int *p, int val);
uint __ovld atomic_and(volatile __global uint *p, uint val);
int __ovld atomic_and(volatile __local int *p, int val);
uint __ovld atomic_and(volatile __local uint *p, uint val);
int __ovld atomic_and(volatile __global int *, int);
uint __ovld atomic_and(volatile __global uint *, uint);
int __ovld atomic_and(volatile __local int *, int);
uint __ovld atomic_and(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_and(volatile int *p, int val);
uint __ovld atomic_and(volatile uint *p, uint val);
int __ovld atomic_and(volatile int *, int);
uint __ovld atomic_and(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_and(volatile __global int *p, int val);
uint __ovld atom_and(volatile __global uint *p, uint val);
int __ovld atom_and(volatile __global int *, int);
uint __ovld atom_and(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_and(volatile __local int *p, int val);
uint __ovld atom_and(volatile __local uint *p, uint val);
int __ovld atom_and(volatile __local int *, int);
uint __ovld atom_and(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_and(volatile __global long *p, long val);
ulong __ovld atom_and(volatile __global ulong *p, ulong val);
long __ovld atom_and(volatile __local long *p, long val);
ulong __ovld atom_and(volatile __local ulong *p, ulong val);
long __ovld atom_and(volatile __global long *, long);
ulong __ovld atom_and(volatile __global ulong *, ulong);
long __ovld atom_and(volatile __local long *, long);
ulong __ovld atom_and(volatile __local ulong *, ulong);
#endif

/**
@ -13174,29 +13174,29 @@ ulong __ovld atom_and(volatile __local ulong *p, ulong val);
|
||||
* (old | val) and store result at location
|
||||
* pointed by p. The function returns old.
|
||||
*/
|
||||
int __ovld atomic_or(volatile __global int *p, int val);
|
||||
uint __ovld atomic_or(volatile __global uint *p, uint val);
|
||||
int __ovld atomic_or(volatile __local int *p, int val);
|
||||
uint __ovld atomic_or(volatile __local uint *p, uint val);
|
||||
int __ovld atomic_or(volatile __global int *, int);
|
||||
uint __ovld atomic_or(volatile __global uint *, uint);
|
||||
int __ovld atomic_or(volatile __local int *, int);
|
||||
uint __ovld atomic_or(volatile __local uint *, uint);
|
||||
#ifdef __OPENCL_CPP_VERSION__
|
||||
int __ovld atomic_or(volatile int *p, int val);
|
||||
uint __ovld atomic_or(volatile uint *p, uint val);
|
||||
int __ovld atomic_or(volatile int *, int);
|
||||
uint __ovld atomic_or(volatile uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_global_int32_extended_atomics)
|
||||
int __ovld atom_or(volatile __global int *p, int val);
|
||||
uint __ovld atom_or(volatile __global uint *p, uint val);
|
||||
int __ovld atom_or(volatile __global int *, int);
|
||||
uint __ovld atom_or(volatile __global uint *, uint);
|
||||
#endif
|
||||
#if defined(cl_khr_local_int32_extended_atomics)
|
||||
int __ovld atom_or(volatile __local int *p, int val);
|
||||
uint __ovld atom_or(volatile __local uint *p, uint val);
|
||||
int __ovld atom_or(volatile __local int *, int);
|
||||
uint __ovld atom_or(volatile __local uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_extended_atomics)
|
||||
long __ovld atom_or(volatile __global long *p, long val);
|
||||
ulong __ovld atom_or(volatile __global ulong *p, ulong val);
|
||||
long __ovld atom_or(volatile __local long *p, long val);
|
||||
ulong __ovld atom_or(volatile __local ulong *p, ulong val);
|
||||
long __ovld atom_or(volatile __global long *, long);
|
||||
ulong __ovld atom_or(volatile __global ulong *, ulong);
|
||||
long __ovld atom_or(volatile __local long *, long);
|
||||
ulong __ovld atom_or(volatile __local ulong *, ulong);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -13205,29 +13205,29 @@ ulong __ovld atom_or(volatile __local ulong *p, ulong val);
|
||||
* (old ^ val) and store result at location
|
||||
* pointed by p. The function returns old.
|
||||
*/
|
||||
int __ovld atomic_xor(volatile __global int *p, int val);
|
||||
uint __ovld atomic_xor(volatile __global uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile __local int *p, int val);
|
||||
uint __ovld atomic_xor(volatile __local uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile __global int *, int);
|
||||
uint __ovld atomic_xor(volatile __global uint *, uint);
|
||||
int __ovld atomic_xor(volatile __local int *, int);
|
||||
uint __ovld atomic_xor(volatile __local uint *, uint);
|
||||
#ifdef __OPENCL_CPP_VERSION__
|
||||
int __ovld atomic_xor(volatile int *p, int val);
|
||||
uint __ovld atomic_xor(volatile uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile int *, int);
|
||||
uint __ovld atomic_xor(volatile uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_global_int32_extended_atomics)
|
||||
int __ovld atom_xor(volatile __global int *p, int val);
|
||||
uint __ovld atom_xor(volatile __global uint *p, uint val);
|
||||
int __ovld atom_xor(volatile __global int *, int);
|
||||
uint __ovld atom_xor(volatile __global uint *, uint);
|
||||
#endif
|
||||
#if defined(cl_khr_local_int32_extended_atomics)
|
||||
int __ovld atom_xor(volatile __local int *p, int val);
|
||||
uint __ovld atom_xor(volatile __local uint *p, uint val);
|
||||
int __ovld atom_xor(volatile __local int *, int);
|
||||
uint __ovld atom_xor(volatile __local uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_extended_atomics)
|
||||
long __ovld atom_xor(volatile __global long *p, long val);
|
||||
ulong __ovld atom_xor(volatile __global ulong *p, ulong val);
|
||||
long __ovld atom_xor(volatile __local long *p, long val);
|
||||
ulong __ovld atom_xor(volatile __local ulong *p, ulong val);
|
||||
long __ovld atom_xor(volatile __global long *, long);
|
||||
ulong __ovld atom_xor(volatile __global ulong *, ulong);
|
||||
long __ovld atom_xor(volatile __local long *, long);
|
||||
ulong __ovld atom_xor(volatile __local ulong *, ulong);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
|
||||
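
The comment blocks in the hunks above all describe the same read-modify-write contract: each built-in atomically combines the old value at p with val, stores the result, and returns old. A minimal OpenCL C usage sketch, not taken from this header; the kernel name and buffer layout are illustrative:

__kernel void reduce_minmax(__global int *data, __global int *lo, __global int *hi) {
    int v = data[get_global_id(0)];
    atomic_min(lo, v); /* *lo = min(*lo, v); returns the previous *lo */
    atomic_max(hi, v); /* *hi = max(*hi, v); returns the previous *hi */
}
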
@ -15257,13 +15257,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float);
@ -15281,13 +15285,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float2, float2);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float2, float2);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float2, float2);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float2, float2);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float2, float2);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float4, float4);
@ -15380,9 +15388,11 @@ float4 __ovld __purefn read_imagef(read_write image2d_array_t, int4);
int4 __ovld __purefn read_imagei(read_write image2d_array_t, int4);
uint4 __ovld __purefn read_imageui(read_write image2d_array_t, int4);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, int4);
int4 __ovld __purefn read_imagei(read_write image3d_t, int4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_write image2d_depth_t, int2);
@ -15423,9 +15433,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4

float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float);
#endif // cl_khr_3d_image_writes

float4 __ovld __purefn read_imagef(read_write image1d_t, sampler_t, float, float, float);
int4 __ovld __purefn read_imagei(read_write image1d_t, sampler_t, float, float, float);
@ -15447,9 +15459,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4

float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float2, float2);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float4, float4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float4, float4);
#endif // cl_khr_3d_image_writes

#endif //cl_khr_mipmap_image

@ -15457,7 +15471,9 @@ uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, floa
#ifdef cl_khr_fp16
half4 __ovld __purefn read_imageh(read_write image1d_t, int);
half4 __ovld __purefn read_imageh(read_write image2d_t, int2);
#ifdef cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image1d_array_t, int2);
half4 __ovld __purefn read_imageh(read_write image2d_array_t, int4);
half4 __ovld __purefn read_imageh(read_write image1d_buffer_t, int);
@ -15727,7 +15743,9 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t);
int __ovld __cnfn get_image_width(read_write image1d_t);
int __ovld __cnfn get_image_width(read_write image1d_buffer_t);
int __ovld __cnfn get_image_width(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image1d_array_t);
int __ovld __cnfn get_image_width(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -15777,7 +15795,9 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t);

#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_height(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_write image2d_depth_t);
@ -15798,11 +15818,11 @@ int __ovld __cnfn get_image_depth(read_only image3d_t);

#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t);
#endif

#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_depth(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes

// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@ -15824,24 +15844,32 @@ int __ovld get_image_num_mip_levels(write_only image3d_t);
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_t);
int __ovld get_image_num_mip_levels(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(read_write image3d_t);
#endif // cl_khr_3d_image_writes
#endif //defined(__opencl_c_read_write_images)

int __ovld get_image_num_mip_levels(read_only image1d_array_t);
int __ovld get_image_num_mip_levels(read_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_only image2d_depth_t);
#endif // cl_khr_depth_images

int __ovld get_image_num_mip_levels(write_only image1d_array_t);
int __ovld get_image_num_mip_levels(write_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(write_only image2d_depth_t);
#endif // cl_khr_depth_images

#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_array_t);
int __ovld get_image_num_mip_levels(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_write image2d_depth_t);
#endif // cl_khr_depth_images
#endif //defined(__opencl_c_read_write_images)

#endif //cl_khr_mipmap_image
@ -15906,7 +15934,9 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_dept
int __ovld __cnfn get_image_channel_data_type(read_write image1d_t);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -15978,7 +16008,9 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t)
int __ovld __cnfn get_image_channel_order(read_write image1d_t);
int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -16048,10 +16080,10 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t);
int4 __ovld __cnfn get_image_dim(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t);
#endif
#if defined(__opencl_c_read_write_images)
int4 __ovld __cnfn get_image_dim(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes

/**
 * Return the image array size.
@ -16266,9 +16298,9 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);

void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags, memory_scope);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

int __ovld __conv sub_group_all(int predicate);
@ -17847,15 +17879,13 @@ intel_sub_group_avc_sic_configure_skc(
    uint skip_block_partition_type, uint skip_motion_vector_mask,
    ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment,
    intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
    uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
    uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
    uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
    uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
    uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
    ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel,
4 lib/include/ppc_wrappers/emmintrin.h vendored
@ -36,7 +36,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -2262,7 +2262,7 @@ extern __inline __m128d

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */

2 lib/include/ppc_wrappers/mm_malloc.h vendored
@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <stdlib.h>

4 lib/include/ppc_wrappers/mmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -1447,7 +1447,7 @@ extern __inline __m64

#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */

4 lib/include/ppc_wrappers/pmmintrin.h vendored
@ -39,7 +39,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* We need definitions from the SSE2 and SSE header files*/
@ -139,7 +139,7 @@ extern __inline __m128i

#else
#include_next <pmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* PMMINTRIN_H_ */

4 lib/include/ppc_wrappers/smmintrin.h vendored
@ -29,7 +29,7 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -657,7 +657,7 @@ extern __inline __m128i

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */

4 lib/include/ppc_wrappers/tmmintrin.h vendored
@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -447,7 +447,7 @@ extern __inline __m64

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */

4 lib/include/ppc_wrappers/xmmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* Define four value permute mask */
@ -1821,7 +1821,7 @@ extern __inline void

#else
#include_next <xmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* XMMINTRIN_H_ */
61 lib/include/prfchiintrin.h vendored Normal file
@ -0,0 +1,61 @@
/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PRFCHIINTRIN_H
#define __PRFCHIINTRIN_H

#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))

/// Loads an instruction sequence containing the specified memory address into
/// all level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit0(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}

/// Loads an instruction sequence containing the specified memory address into
/// all but the first-level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit1(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS

#endif /* __PRFCHWINTRIN_H */
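
A hedged usage sketch for the two intrinsics this new file declares; hot_function is a hypothetical symbol, and the header is reached through <x86gprintrin.h> when the prefetchi feature is enabled:

#include <x86gprintrin.h>

extern void hot_function(void); /* hypothetical: code we expect to run soon */

void warm_icache(void) {
    /* Hint the CPU to pull the instruction bytes at this address into cache;
       as the doc comments note, the actual effect is implementation dependent. */
    _m_prefetchit0((const void *)hot_function);
}
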
203 lib/include/raointintrin.h vendored Normal file
@ -0,0 +1,203 @@
/*===----------------------- raointintrin.h - RAOINT ------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86GPRINTRIN_H
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __RAOINTINTRIN_H
#define __RAOINTINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("raoint")))

/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
  __builtin_ia32_aadd32((int *)__A, __B);
}

/// Atomically and a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
  __builtin_ia32_aand32((int *)__A, __B);
}

/// Atomically or a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
  __builtin_ia32_aor32((int *)__A, __B);
}

/// Atomically xor a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
  __builtin_ia32_axor32((int *)__A, __B);
}

#ifdef __x86_64__
/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aadd64((long long *)__A, __B);
}

/// Atomically and a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aand64((long long *)__A, __B);
}

/// Atomically or a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
                                                   long long __B) {
  __builtin_ia32_aor64((long long *)__A, __B);
}

/// Atomically xor a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_axor64((long long *)__A, __B);
}
#endif // __x86_64__

#undef __DEFAULT_FN_ATTRS
#endif // __RAOINTINTRIN_H
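
A minimal sketch of the intended use of these remote-atomic intrinsics, assuming a build with the raoint feature enabled; the function and variable names are illustrative:

#include <x86gprintrin.h>

/* Many threads bump a shared counter with no ordering requirement, the
   contended pattern the doc comments above recommend these intrinsics for. */
void histogram_bump(long long *counter) {
    _aadd_i64(counter, 1); /* remote-atomic add; nothing is returned */
}
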
21 lib/include/riscv_vector.h vendored
@ -25,6 +25,8 @@ extern "C" {
#pragma clang riscv intrinsic vector


#define vlenb() __builtin_rvv_vlenb()

enum RVV_CSR {
  RVV_VSTART = 0,
  RVV_VXSAT,
@ -70,7 +72,6 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
  }
}

#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
#define vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
#define vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
@ -78,25 +79,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
#define vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)

#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
#define vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
#define vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
#define vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
#define vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)

#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
#define vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
#define vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
#define vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
#define vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)

#if __riscv_v_elen >= 64
#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)

#define vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
#define vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
#define vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
#define vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
#endif

#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
#define vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
#define vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
@ -104,23 +108,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
#define vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)

#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
#define vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
#define vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
#define vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
#define vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)

#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
#define vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
#define vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
#define vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
#define vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)

#if __riscv_v_elen >= 64
#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)

#define vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
#define vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
#define vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
#define vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
#endif

typedef __rvv_bool64_t vbool64_t;
typedef __rvv_bool32_t vbool32_t;
typedef __rvv_bool16_t vbool16_t;
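
A small sketch of the strip-mining idiom these macros support; the names are illustrative. vsetvl_e32m1 requests up to avl 32-bit elements at LMUL=1 and returns the count the hardware grants for this iteration:

#include <riscv_vector.h>

void scale(float *dst, const float *src, size_t n, float k) {
    for (size_t i = 0; i < n;) {
        size_t vl = vsetvl_e32m1(n - i); /* elements processed this pass */
        for (size_t j = 0; j < vl; ++j)  /* scalar stand-in for vector ops */
            dst[i + j] = k * src[i + j];
        i += vl;
    }
}
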
2 lib/include/smmintrin.h vendored
@ -818,7 +818,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
///    parameter, is copied to the result.
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be be copied to, and which bits in the result will be
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///    00: Selects bits [31:0] from operand \a Y. \n
30 lib/include/stdarg.h vendored
@ -8,13 +8,30 @@
 */

#ifndef __STDARG_H
#define __STDARG_H

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST
typedef __builtin_va_list __gnuc_va_list;
#endif

#ifdef __need___va_list
#undef __need___va_list
#else
#define __STDARG_H
#ifndef _VA_LIST
typedef __builtin_va_list va_list;
#define _VA_LIST
#endif

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* C2x does not require the second parameter for va_start. */
#define va_start(ap, ...) __builtin_va_start(ap, 0)
#else
/* Versions before C2x do require the second parameter. */
#define va_start(ap, param) __builtin_va_start(ap, param)
#endif
#define va_end(ap) __builtin_va_end(ap)
#define va_arg(ap, type) __builtin_va_arg(ap, type)

@ -23,13 +40,12 @@ typedef __builtin_va_list va_list;
 */
#define __va_copy(d,s) __builtin_va_copy(d,s)

#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    !defined(__STRICT_ANSI__)
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST 1
typedef __builtin_va_list __gnuc_va_list;
#endif

#endif /* __STDARG_H */

#endif /* not __STDARG_H */
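
The new conditional changes only how va_start is spelled, not how callers use it: in C2x mode the macro becomes variadic and ignores any argument after the va_list. A sketch that compiles under both branches:

#include <stdarg.h>

int sum(int count, ...) {
    va_list ap;
    va_start(ap, count); /* pre-C2x: required second argument;
                            C2x: swallowed by the variadic macro */
    int total = 0;
    for (int i = 0; i < count; i++)
        total += va_arg(ap, int);
    va_end(ap);
    return total;
}
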
9 lib/include/stdatomic.h vendored
@ -15,10 +15,12 @@
 *
 * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
 * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
 * to the clang resource header until that is fully supported.
 * to the clang resource header until that is fully supported. The
 * `stdatomic.h` header requires C++ 23 or newer.
 */
#if __STDC_HOSTED__ && \
    __has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
    __has_include_next(<stdatomic.h>) && \
    (!defined(_MSC_VER) || (defined(__cplusplus) && __cplusplus >= 202002L))
# include_next <stdatomic.h>
#else

@ -45,7 +47,8 @@ extern "C" {
/* 7.17.2 Initialization */

#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
#if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201710L) || \
     (defined(__cplusplus) && __cplusplus >= 202002L)) && \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
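
Since the pragma above now flags ATOMIC_VAR_INIT, a sketch of the migration the deprecation points at; plain initialization has been equivalent since C11:

#include <stdatomic.h>

static atomic_int old_style = ATOMIC_VAR_INIT(0); /* now draws a warning */
static atomic_int new_style = 0;                  /* same effect, no warning */
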
4 lib/include/stdbool.h vendored
@ -12,7 +12,7 @@

#define __bool_true_false_are_defined 1

#if __STDC_VERSION__ > 201710L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
 * to system headers which include this header file unconditionally.
 */
@ -23,7 +23,7 @@
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if __cplusplus < 201103L
#if defined(__cplusplus) && __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
9 lib/include/stddef.h vendored
@ -97,8 +97,15 @@ using ::std::nullptr_t;
#undef __need_NULL
#endif /* defined(__need_NULL) */

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
typedef typeof(nullptr) nullptr_t;
#endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */

#if defined(__need_STDDEF_H_misc)
#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)
#include "__stddef_max_align_t.h"
#endif
#define offsetof(t, d) __builtin_offsetof(t, d)
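
A short sketch of the surface this hunk adds: nullptr_t exists only when the compiler reports a C2x __STDC_VERSION__, while offsetof is unchanged. Names are illustrative:

#include <stddef.h>

struct pair { char a; int b; };
size_t b_offset = offsetof(struct pair, b); /* unchanged behavior */

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
nullptr_t nothing = nullptr; /* the type's only value */
#endif
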
198 lib/include/stdint.h vendored
@ -96,13 +96,21 @@
typedef __INT64_TYPE__ int64_t;
# endif /* __int8_t_defined */
typedef __UINT64_TYPE__ uint64_t;
# undef __int_least64_t
# define __int_least64_t int64_t
# undef __uint_least64_t
# define __uint_least64_t uint64_t
# undef __int_least32_t
# define __int_least32_t int64_t
# undef __uint_least32_t
# define __uint_least32_t uint64_t
# undef __int_least16_t
# define __int_least16_t int64_t
# undef __uint_least16_t
# define __uint_least16_t uint64_t
# undef __int_least8_t
# define __int_least8_t int64_t
# undef __uint_least8_t
# define __uint_least8_t uint64_t
#endif /* __INT64_TYPE__ */

@ -120,11 +128,17 @@ typedef int56_t int_least56_t;
typedef uint56_t uint_least56_t;
typedef int56_t int_fast56_t;
typedef uint56_t uint_fast56_t;
# undef __int_least32_t
# define __int_least32_t int56_t
# undef __uint_least32_t
# define __uint_least32_t uint56_t
# undef __int_least16_t
# define __int_least16_t int56_t
# undef __uint_least16_t
# define __uint_least16_t uint56_t
# undef __int_least8_t
# define __int_least8_t int56_t
# undef __uint_least8_t
# define __uint_least8_t uint56_t
#endif /* __INT56_TYPE__ */

@ -136,11 +150,17 @@ typedef int48_t int_least48_t;
typedef uint48_t uint_least48_t;
typedef int48_t int_fast48_t;
typedef uint48_t uint_fast48_t;
# undef __int_least32_t
# define __int_least32_t int48_t
# undef __uint_least32_t
# define __uint_least32_t uint48_t
# undef __int_least16_t
# define __int_least16_t int48_t
# undef __uint_least16_t
# define __uint_least16_t uint48_t
# undef __int_least8_t
# define __int_least8_t int48_t
# undef __uint_least8_t
# define __uint_least8_t uint48_t
#endif /* __INT48_TYPE__ */

@ -152,11 +172,17 @@ typedef int40_t int_least40_t;
typedef uint40_t uint_least40_t;
typedef int40_t int_fast40_t;
typedef uint40_t uint_fast40_t;
# undef __int_least32_t
# define __int_least32_t int40_t
# undef __uint_least32_t
# define __uint_least32_t uint40_t
# undef __int_least16_t
# define __int_least16_t int40_t
# undef __uint_least16_t
# define __uint_least16_t uint40_t
# undef __int_least8_t
# define __int_least8_t int40_t
# undef __uint_least8_t
# define __uint_least8_t uint40_t
#endif /* __INT40_TYPE__ */

@ -172,11 +198,17 @@ typedef __INT32_TYPE__ int32_t;
typedef __UINT32_TYPE__ uint32_t;
# endif /* __uint32_t_defined */

# undef __int_least32_t
# define __int_least32_t int32_t
# undef __uint_least32_t
# define __uint_least32_t uint32_t
# undef __int_least16_t
# define __int_least16_t int32_t
# undef __uint_least16_t
# define __uint_least16_t uint32_t
# undef __int_least8_t
# define __int_least8_t int32_t
# undef __uint_least8_t
# define __uint_least8_t uint32_t
#endif /* __INT32_TYPE__ */

@ -194,9 +226,13 @@ typedef int24_t int_least24_t;
typedef uint24_t uint_least24_t;
typedef int24_t int_fast24_t;
typedef uint24_t uint_fast24_t;
# undef __int_least16_t
# define __int_least16_t int24_t
# undef __uint_least16_t
# define __uint_least16_t uint24_t
# undef __int_least8_t
# define __int_least8_t int24_t
# undef __uint_least8_t
# define __uint_least8_t uint24_t
#endif /* __INT24_TYPE__ */

@ -205,9 +241,13 @@ typedef uint24_t uint_fast24_t;
typedef __INT16_TYPE__ int16_t;
#endif /* __int8_t_defined */
typedef __UINT16_TYPE__ uint16_t;
# undef __int_least16_t
# define __int_least16_t int16_t
# undef __uint_least16_t
# define __uint_least16_t uint16_t
# undef __int_least8_t
# define __int_least8_t int16_t
# undef __uint_least8_t
# define __uint_least8_t uint16_t
#endif /* __INT16_TYPE__ */

@ -224,7 +264,9 @@ typedef __uint_least16_t uint_fast16_t;
typedef __INT8_TYPE__ int8_t;
#endif /* __int8_t_defined */
typedef __UINT8_TYPE__ uint8_t;
# undef __int_least8_t
# define __int_least8_t int8_t
# undef __uint_least8_t
# define __uint_least8_t uint8_t
#endif /* __INT8_TYPE__ */
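
These blocks run from the widest type down to the narrowest, and each one re-targets the __int_leastN_t/__uint_leastN_t aliases at the smallest exact-width type seen so far; the new #undef lines keep redefinition warnings away when a narrower type later takes over. A sketch of the observable effect, assuming a hypothetical target with no 16-bit integer type:

#include <stdint.h>

/* With __INT32_TYPE__ present but __INT16_TYPE__ absent, the chain leaves
   the 16-bit "least" aliases pointing at the 32-bit type. */
int_least16_t small = 1234; /* at least 16 bits wide; may actually be 32 */
_Static_assert(sizeof(int_least16_t) * 8 >= 16, "least-width lower bound");
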
@ -285,16 +327,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT64_TYPE__
|
||||
# undef __int64_c_suffix
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT64_C_SUFFIX__
|
||||
# define __int64_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int32_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT64_C_SUFFIX__
|
||||
# else
|
||||
# undef __int64_c_suffix
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT64_C_SUFFIX__ */
|
||||
#endif /* __INT64_TYPE__ */
|
||||
|
||||
@ -310,6 +351,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT56_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT56_C_SUFFIX__
|
||||
# define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
|
||||
# define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
|
||||
@ -319,14 +363,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT56_C(v) v
|
||||
# define UINT56_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT56_C_SUFFIX__ */
|
||||
#endif /* __INT56_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT48_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT48_C_SUFFIX__
|
||||
# define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
|
||||
# define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
|
||||
@ -336,14 +380,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT48_C(v) v
|
||||
# define UINT48_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT48_C_SUFFIX__ */
|
||||
#endif /* __INT48_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT40_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT40_C_SUFFIX__
|
||||
# define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
|
||||
# define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
|
||||
@ -353,22 +397,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT40_C(v) v
|
||||
# define UINT40_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT40_C_SUFFIX__ */
|
||||
#endif /* __INT40_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT32_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT32_C_SUFFIX__
|
||||
# define __int32_c_suffix __INT32_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT32_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT32_C_SUFFIX__
|
||||
#else
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT32_C_SUFFIX__ */
|
||||
#endif /* __INT32_TYPE__ */
|
||||
|
||||
@ -384,6 +424,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT24_TYPE__
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT24_C_SUFFIX__
|
||||
# define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
|
||||
# define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
|
||||
@ -392,19 +434,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT24_C(v) v
|
||||
# define UINT24_C(v) v ## U
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT24_C_SUFFIX__ */
|
||||
#endif /* __INT24_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT16_TYPE__
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT16_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT16_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT16_C_SUFFIX__
|
||||
#else
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT16_C_SUFFIX__ */
|
||||
#endif /* __INT16_TYPE__ */
|
||||
|
||||
@ -420,10 +459,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT8_TYPE__
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT8_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT8_C_SUFFIX__
|
||||
#else
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT8_C_SUFFIX__ */
|
||||
#endif /* __INT8_TYPE__ */
|
||||
|
||||
@ -463,27 +501,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# define UINT64_MAX UINT64_C(18446744073709551615)
|
||||
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||
in C2x mode; switch to the correct values once they've been published. */
|
||||
#if __STDC_VERSION__ >= 202000L
|
||||
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
|
||||
# define UINT64_WIDTH 64
|
||||
# define INT64_WIDTH UINT64_WIDTH
|
||||
|
||||
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST32_WIDTH
|
||||
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST16_WIDTH
|
||||
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST8_MAX
|
||||
# define __UINT_LEAST8_MAX UINT64_MAX
|
||||
#endif /* __STDC_VERSION__ */
|
||||
|
||||
# define __INT_LEAST64_MIN INT64_MIN
|
||||
# define __INT_LEAST64_MAX INT64_MAX
|
||||
# define __UINT_LEAST64_MAX UINT64_MAX
|
||||
# undef __INT_LEAST32_MIN
|
||||
# define __INT_LEAST32_MIN INT64_MIN
|
||||
# undef __INT_LEAST32_MAX
|
||||
# define __INT_LEAST32_MAX INT64_MAX
|
||||
# undef __UINT_LEAST32_MAX
|
||||
# define __UINT_LEAST32_MAX UINT64_MAX
|
||||
# undef __INT_LEAST16_MIN
|
||||
# define __INT_LEAST16_MIN INT64_MIN
|
||||
# undef __INT_LEAST16_MAX
|
||||
# define __INT_LEAST16_MAX INT64_MAX
|
||||
# undef __UINT_LEAST16_MAX
|
||||
# define __UINT_LEAST16_MAX UINT64_MAX
|
||||
# undef __INT_LEAST8_MIN
|
||||
# define __INT_LEAST8_MIN INT64_MIN
|
||||
# undef __INT_LEAST8_MAX
|
||||
# define __INT_LEAST8_MAX INT64_MAX
|
||||
# undef __UINT_LEAST8_MAX
|
||||
# define __UINT_LEAST8_MAX UINT64_MAX
|
||||
#endif /* __INT64_TYPE__ */
|
||||
|
||||
@ -497,7 +547,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH

@ -517,27 +567,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT56_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT56_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT56_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT56_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT56_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT56_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT56_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT56_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */

@ -554,27 +616,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT48_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT48_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT48_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT48_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT48_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT48_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT48_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT48_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#undef __UINT_LEAST32_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
# undef __UINT_LEAST16_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
# undef __UINT_LEAST8_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
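
The least-width cascade above is what keeps the exotic exact-width types
self-consistent: on a target whose narrowest type wider than 16 bits is
48-bit, the 32-bit "least" limits must fall back to the 48-bit ones. A sketch
of the resulting guarantee (hypothetical target with __INT48_TYPE__ and no
__INT32_TYPE__; no real Clang target needs this branch):

#include <limits.h>
#include <stdint.h>
#if defined(__INT48_TYPE__) && !defined(__INT32_TYPE__)
/* int_least32_t is the 48-bit type, so its limits are the 48-bit ones. */
_Static_assert(INT_LEAST32_MAX == INT48_MAX, "least32 falls back to 48-bit");
_Static_assert(sizeof(int_least32_t) * CHAR_BIT >= 48, "wide enough");
#endif
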
@ -591,27 +665,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT40_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT40_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT40_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT40_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT40_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT40_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT40_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT40_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */

@ -622,23 +708,35 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT32_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT32_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT32_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT32_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT32_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT32_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT32_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT32_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
@ -653,7 +751,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH

@ -673,23 +771,31 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX

# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT24_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT24_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT24_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT24_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT24_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@ -700,19 +806,27 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)

# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT16_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT16_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT16_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT16_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT16_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */

@ -727,7 +841,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
@ -741,15 +855,19 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)

# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT8_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT8_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */

@ -764,7 +882,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH

@ -792,7 +910,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__

@ -813,7 +931,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__

@ -849,7 +967,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
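
Taken together, the stdint.h changes make the C2x width macros usable under
-Wundef in any language mode. A small usage sketch (standard C2x names only;
build with -std=c2x):

#include <stdint.h>
#if defined(INT32_WIDTH) && defined(INTPTR_WIDTH)
_Static_assert(INT32_WIDTH == 32, "exact-width types have exact widths");
_Static_assert(INTPTR_WIDTH == UINTPTR_WIDTH, "signed/unsigned widths agree");
#endif
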
2
lib/include/stdnoreturn.h
vendored
@ -13,7 +13,7 @@
#define noreturn _Noreturn
#define __noreturn_is_defined 1

#if __STDC_VERSION__ > 201710L && \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) && \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* The noreturn macro is deprecated in C2x. We do not mark it as such because
   including the header file in C2x is also deprecated and we do not want to
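
The noreturn convenience macro this header provides still expands to the
_Noreturn keyword; only the deprecation-warning guard changed. A usage sketch
(hypothetical die() helper):

#include <stdlib.h>
#include <stdnoreturn.h>

noreturn void die(int code) { /* expands to: _Noreturn void die(int code) */
  exit(code);
}
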
3
lib/include/unwind.h
vendored
@ -65,7 +65,8 @@ struct _Unwind_Context;
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
struct _Unwind_Control_Block;
typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
typedef struct _Unwind_Control_Block _Unwind_Control_Block;
#define _Unwind_Exception _Unwind_Control_Block /* Alias */
#else
struct _Unwind_Exception;
typedef struct _Unwind_Exception _Unwind_Exception;
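
On ARM EHABI the alias moves from a typedef to a #define, so spelling out the
struct tag in user code now also resolves to the control block. A sketch of
the effect (illustrative only):

/* With the #define alias in force on ARM EHABI targets: */
struct _Unwind_Exception *exc; /* really struct _Unwind_Control_Block * */
_Unwind_Exception object;      /* still works, as with the old typedef  */
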
2
lib/include/velintrin.h
vendored
@ -13,7 +13,7 @@
typedef double __vr __attribute__((__vector_size__(2048)));

// Vector mask registers
#if __STDC_VERSION__ >= 199901L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
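
The C99 gate exists because _Bool is a C99 keyword, so the vector-of-_Bool
mask types can only be declared in C99 and later. A minimal sketch of the
same pattern (hypothetical mask type name):

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
typedef _Bool my_mask256 __attribute__((ext_vector_type(256)));
#endif
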
26
lib/include/x86gprintrin.h
vendored
@ -25,23 +25,35 @@
#include <crc32intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__PRFCHI__)
#include <prfchiintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__RAOINT__)
#include <raointintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__CMPCCXADD__)
#include <cmpccxaddintrin.h>
#endif

#if defined(__i386__)
#define __FULLBX "ebx"
#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
#define __TMPGPR "eax"
#else
// When in 64-bit target, the 32-bit operands generate a 32-bit result,
// zero-extended to a 64-bit result in the destination general-purpose
// register. This means "mov x, %ebx" will clobber the higher 32 bits of
// rbx, so we should preserve the 64-bit register rbx.
#define __FULLBX "rbx"
|
||||
#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
|
||||
#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
|
||||
#define __TMPGPR "rax"
|
||||
#endif
|
||||
|
||||
#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
|
||||
|
||||
#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
|
||||
#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
|
||||
|
||||
#define __SSC_MARK(__Tag) \
|
||||
__asm__ __volatile__( __SAVE_GPRBX \
|
||||
"mov {%0, %%ebx|ebx, %0}; " \
|
||||
|
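
The refactor collapses the four dialect-aware save/restore strings into one
__MOVEGPR template; the {att|intel} braces let a single string literal serve
both inline-asm dialects. A worked expansion (sketch, 64-bit case):

/* __SAVE_GPRBX
     -> __MOVEGPR(__FULLBX, __TMPGPR)
     -> __MOVEGPR("rbx", "rax")
     -> "mov {%%rbx, %%rax|rax, rbx};"
   i.e. AT&T syntax left of the '|', Intel syntax right of it. */
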
3
lib/include/xmmintrin.h
vendored
@ -1906,7 +1906,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
  return __extension__ (__m128){ 0, 0, 0, 0 };
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
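
Switching the initializer from integer 0 to 0.0f does not change the value;
it only spells the float lanes as floats, which presumably keeps
conversion-sensitive tooling quiet. Usage is unchanged (sketch):

#include <xmmintrin.h>
__m128 z = _mm_setzero_ps(); /* all four lanes are 0.0f */
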
@ -3005,7 +3005,6 @@ do { \
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
#define _m_ _mm_

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX