commit 85be0b8c65: update C headers to LLVM 16

upstream commit 0604154e006e88e9e7f82d8ee5fd076bda206613
parent commit b260bf42d3
@@ -666,6 +666,7 @@ __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}

+#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
@@ -713,6 +714,7 @@ __tex_fetch(__DataT *, __RetT *__ptr,
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
+#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
lib/include/__clang_hip_libdevice_declares.h (vendored): 5 lines changed
@@ -288,12 +288,17 @@ __llvm_amdgcn_rsq_f64(double __x) {

__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
                                                          _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
lib/include/__clang_hip_math.h (vendored): 23 lines changed
@@ -70,9 +70,9 @@ __DEVICE__ void __static_assert_equal_size() {
#endif

__DEVICE__
-uint64_t __make_mantissa_base8(const char *__tagp) {
+uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '7')
@@ -87,9 +87,9 @@ uint64_t __make_mantissa_base8(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa_base10(const char *__tagp) {
+uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '9')
@@ -104,9 +104,9 @@ uint64_t __make_mantissa_base10(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa_base16(const char *__tagp) {
+uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
  uint64_t __r = 0;
-  while (__tagp) {
+  while (*__tagp != '\0') {
    char __tmp = *__tagp;

    if (__tmp >= '0' && __tmp <= '9')
@@ -125,10 +125,7 @@ uint64_t __make_mantissa_base16(const char *__tagp) {
}

__DEVICE__
-uint64_t __make_mantissa(const char *__tagp) {
-  if (!__tagp)
-    return 0u;
-
+uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
  if (*__tagp == '0') {
    ++__tagp;

@@ -233,7 +230,7 @@ __DEVICE__
float expm1f(float __x) { return __ocml_expm1_f32(__x); }

__DEVICE__
-float fabsf(float __x) { return __ocml_fabs_f32(__x); }
+float fabsf(float __x) { return __builtin_fabsf(__x); }

__DEVICE__
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
@@ -359,7 +356,7 @@ float modff(float __x, float *__iptr) {
}

__DEVICE__
-float nanf(const char *__tagp) {
+float nanf(const char *__tagp __attribute__((nonnull))) {
  union {
    float val;
    struct ieee_float {
@@ -792,7 +789,7 @@ __DEVICE__
double expm1(double __x) { return __ocml_expm1_f64(__x); }

__DEVICE__
-double fabs(double __x) { return __ocml_fabs_f64(__x); }
+double fabs(double __x) { return __builtin_fabs(__x); }

__DEVICE__
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
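The __make_mantissa changes above trade a runtime NULL test for a compile-time contract: __tagp is now declared __attribute__((nonnull)) and the loop tests *__tagp instead of the pointer itself. A minimal host-side sketch of the same pattern (the function name and main() are illustrative, not part of the header):

    #include <assert.h>
    #include <stdint.h>

    /* Analogue of __make_mantissa_base8: the nonnull annotation lets the
     * callee dereference unconditionally, so the old `while (__tagp)` NULL
     * check becomes a plain end-of-string scan. */
    static uint64_t make_mantissa_base8(const char *tagp
                                        __attribute__((nonnull))) {
      uint64_t r = 0;
      while (*tagp != '\0') {
        char c = *tagp;
        if (c >= '0' && c <= '7')
          r = (r * 8u) + (uint64_t)(c - '0');
        else
          return 0;
        ++tagp;
      }
      return r;
    }

    int main(void) {
      assert(make_mantissa_base8("17") == 15); /* octal 17 */
      /* Passing NULL is now undefined behavior; callers such as nanf()
       * must supply a valid string. */
      return 0;
    }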
lib/include/__clang_hip_runtime_wrapper.h (vendored): 1 line changed
@@ -113,6 +113,7 @@ __attribute__((weak)) inline __device__ void free(void *__ptr) {

#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
+#include <__clang_hip_stdlib.h>

#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
lib/include/__clang_hip_stdlib.h (vendored, new file): 43 lines
@@ -0,0 +1,43 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_HIP_STDLIB_H__

#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif

#if !defined(__cplusplus)

#include <limits.h>

#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif

__DEVICE__
int abs(int __x) {
  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}

#endif // !defined(__cplusplus)

#endif // #define __CLANG_HIP_STDLIB_H__
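The abs family in the new header is branchless: shifting x right by (width - 1) produces 0 for non-negative values and -1 (all ones) for negative ones under the arithmetic right shift clang implements, and (x ^ sgn) - sgn is then a conditional two's-complement negation. A standalone illustration under that same shift assumption:

    #include <limits.h>
    #include <stdio.h>

    /* sgn == 0 when x >= 0, sgn == -1 (all bits set) when x < 0, so the
     * xor flips the bits of negative inputs and the subtraction adds 1,
     * i.e. negates them; non-negative inputs pass through unchanged. */
    static int branchless_abs(int x) {
      int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
      return (x ^ sgn) - sgn;
    }

    int main(void) {
      printf("%d %d %d\n", branchless_abs(-42), branchless_abs(0),
             branchless_abs(7)); /* prints: 42 0 7 */
      /* INT_MIN still overflows, exactly as the standard abs() does. */
      return 0;
    }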
lib/include/altivec.h (vendored): 28 lines changed
@@ -17323,32 +17323,32 @@ provided.
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast

#ifdef __VSX__
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vsbox(vector unsigned long long __a) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned char __a) {
  return __builtin_altivec_crypto_vsbox(__a);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipher(vector unsigned long long __a,
-                         vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned char __a,
+                         vector unsigned char __b) {
  return __builtin_altivec_crypto_vcipher(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipherlast(vector unsigned long long __a,
-                             vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned char __a,
+                             vector unsigned char __b) {
  return __builtin_altivec_crypto_vcipherlast(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipher(vector unsigned long long __a,
-                          vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned char __a,
+                          vector unsigned char __b) {
  return __builtin_altivec_crypto_vncipher(__a, __b);
}

-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipherlast(vector unsigned long long __a,
-                              vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned char __a,
+                              vector unsigned char __b) {
  return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
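The crypto wrappers now operate on vector unsigned char rather than vector unsigned long long, matching the byte-granular AES state these instructions process. A hedged usage sketch (assumes a POWER8+ target with VSX/crypto enabled; the function is illustrative, not part of the header):

    /* Build for a crypto-capable POWER target, e.g.
     *   clang --target=powerpc64le-linux-gnu -mcpu=power8 -c sbox.c */
    #include <altivec.h>

    vector unsigned char aes_sub_bytes(vector unsigned char state) {
      /* With the LLVM 16 header the natural byte-vector type is accepted
       * directly; the old signature forced casts at call sites. */
      return __builtin_crypto_vsbox(state);
    }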
lib/include/amxfp16intrin.h (vendored, new file): 58 lines
@@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b)                                              \
  __builtin_ia32_tdpfp16ps(dst, a, b)

#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
lib/include/amxintrin.h (vendored): 32 lines changed
@@ -22,6 +22,8 @@
    __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+#define __DEFAULT_FN_ATTRS_FP16 \
+    __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))

/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}

+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
/// This struct pack the shape and tile data together for user. We suggest
/// initializing the struct as early as possible, because compiler depends
/// on the shape information to do configure. The constant value is preferred
@@ -484,9 +493,32 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                     src0.tile, src1.tile);
}

+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_FP16
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+                                       __tile1024i src1) {
+  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
+#undef __DEFAULT_FN_ATTRS_FP16

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
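Taken together with amxfp16intrin.h above, this wires the amx-fp16 tile dot-product into the existing __tile1024i helper API. A rough sketch of where __tile_dpfp16ps sits in the usual load/compute/store flow (shapes are illustrative; real code must first request AMX state from the OS and program a tile palette with _tile_loadconfig):

    #include <immintrin.h>
    #include <stddef.h>

    /* Illustrative only: assumes an AMX-FP16 machine with tiles already
     * configured for 16 rows x 64 bytes. */
    void dot_fp16_tiles(const void *a, const void *b, void *c,
                        size_t sa, size_t sb, size_t sc) {
      __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
      __tile_loadd(&ta, a, sa);
      __tile_loadd(&tb, b, sb);
      __tile_zero(&tc);
      __tile_dpfp16ps(&tc, ta, tb); /* FP16 pairs -> FP32 accumulate */
      __tile_stored(c, sc, tc);
    }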
lib/include/arm_acle.h (vendored): 151 lines changed
@@ -64,7 +64,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
}
#endif

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

@@ -82,7 +82,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
@@ -93,7 +93,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
@@ -140,17 +140,17 @@ __rorl(unsigned long __x, uint32_t __y) {
/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
-  return __builtin_clz(__t);
+  return (uint32_t)__builtin_clz(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
-  return __builtin_clzl(__t);
+  return (unsigned long)__builtin_clzl(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
-  return __builtin_clzll(__t);
+  return (uint64_t)__builtin_clzll(__t);
}

/* CLS */
@@ -201,7 +201,7 @@ __rev16(uint32_t __t) {

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
-  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
@@ -216,7 +216,7 @@ __rev16l(unsigned long __t) {
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
-  return __builtin_bswap16(__t);
+  return (int16_t)__builtin_bswap16((uint16_t)__t);
}

/* RBIT */
@@ -227,7 +227,7 @@ __rbit(uint32_t __t) {

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
@@ -247,7 +247,7 @@ __rbitl(unsigned long __t) {
/*
 * 9.3 16-bit multiplications
 */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
@@ -277,17 +277,17 @@ __smulwt(int32_t __a, int32_t __b) {
/*
 * 9.4 Saturating intrinsics
 *
- * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_FEATURE_SAT
+#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
@@ -305,7 +305,7 @@ __qdbl(int32_t __t) {
#endif

/* 9.4.3 Accumultating multiplications */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
@@ -334,13 +334,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {


/* 9.5.4 Parallel 16-bit saturation */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 9.5.5 Packing and unpacking */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
@@ -365,7 +365,7 @@ __uxtb16(int8x4_t __a) {
#endif

/* 9.5.6 Parallel selection */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
@@ -373,7 +373,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
#endif

/* 9.5.7 Parallel 8-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
@@ -425,7 +425,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
#endif

/* 9.5.8 Sum of 8-bit absolute differences */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
@@ -437,7 +437,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
#endif

/* 9.5.9 Parallel 16-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
@@ -537,7 +537,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
#endif

/* 9.5.10 Parallel 16-bit multiplications */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
@@ -589,155 +589,156 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
#endif

/* 9.7 CRC32 intrinsics */
-#if __ARM_FEATURE_CRC32
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
+    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif

/* Armv8.3-A Javascript conversion intrinsic */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32zf(float __a) {
-  return __builtin_arm_frint32zf(__a);
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32zf(float __a) {
+  return __builtin_arm_rint32zf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32z(double __a) {
-  return __builtin_arm_frint32z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32z(double __a) {
+  return __builtin_arm_rint32z(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64zf(float __a) {
-  return __builtin_arm_frint64zf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64zf(float __a) {
+  return __builtin_arm_rint64zf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64z(double __a) {
-  return __builtin_arm_frint64z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64z(double __a) {
+  return __builtin_arm_rint64z(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32xf(float __a) {
-  return __builtin_arm_frint32xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32xf(float __a) {
+  return __builtin_arm_rint32xf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32x(double __a) {
-  return __builtin_arm_frint32x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32x(double __a) {
+  return __builtin_arm_rint32x(__a);
}

-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64xf(float __a) {
-  return __builtin_arm_frint64xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64xf(float __a) {
+  return __builtin_arm_rint64xf(__a);
}

-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64x(double __a) {
-  return __builtin_arm_frint64x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64x(double __a) {
+  return __builtin_arm_rint64x(__a);
}
#endif

/* Armv8.7-A load/store 64-byte intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

-static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif

/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

/* Memory Tagging Extensions (MTE) Intrinsics */
-#if __ARM_FEATURE_MEMORY_TAGGING
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
-#endif

/* Memory Operations Intrinsics */
-#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif

/* Transactional Memory Extension (TME) Intrinsics */
-#if __ARM_FEATURE_TME
+#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
@@ -759,12 +760,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */

/* Armv8.5-A Random number generation intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
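Most of the arm_acle.h churn is one mechanical pattern: `#if FOO` becomes `#if defined(FOO) && FOO`. Both treat an undefined FOO as false, but the old form evaluates the undefined identifier and so trips -Wundef in user builds that include the header; with short-circuit evaluation, the new form never does. The other recurring change, adding per-function attributes like target("crc") and target("v8.5a"), lets the intrinsics be declared unconditionally on AArch64 instead of hiding behind feature macros. A condensed illustration of the guard difference (the macro name is hypothetical):

    /* Compile with: clang -Wundef -c guard.c */

    #if __SOME_UNDEFINED_FEATURE /* warns under -Wundef */
    int old_style_guard;
    #endif

    #if defined(__SOME_UNDEFINED_FEATURE) && __SOME_UNDEFINED_FEATURE
    int new_style_guard; /* silent: the right operand is not evaluated
                            when defined() is false */
    #endif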
lib/include/arm_fp16.h (vendored): 2 lines changed
@@ -29,7 +29,7 @@
typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

-#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)
+#if defined(__aarch64__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
35180
lib/include/arm_neon.h
vendored
35180
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
Load Diff
lib/include/arm_neon_sve_bridge.h (vendored, new file): 182 lines
@@ -0,0 +1,182 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H

#include <arm_neon.h>
#include <arm_sve.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio                                                                  \
  static __inline__                                                            \
      __attribute__((__always_inline__, __nodebug__, __overloadable__))

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);

#undef __ai
#undef __aio

#ifdef __cplusplus
} // extern "C"
#endif

#endif //__ARM_NEON_SVE_BRIDGE_H
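A small usage sketch for the bridge header: svset_neonq writes a 128-bit NEON vector into the low bits of an SVE vector, svget_neonq reads it back, and svdup_neonq broadcasts it across the whole SVE register. Assumes an AArch64 compiler with SVE enabled (e.g. -march=armv8-a+sve); the function itself is illustrative:

    #include <arm_neon_sve_bridge.h>

    /* Add a NEON int32x4_t into the low 128 bits of an SVE accumulator
     * and return the updated low quadword as NEON again. */
    int32x4_t accumulate_low(svint32_t acc, int32x4_t v) {
      int32x4_t lo = svget_neonq_s32(acc);   /* SVE -> NEON, low 128 bits */
      svint32_t upd = svset_neonq_s32(acc, vaddq_s32(lo, v));
      return svget_neonq(upd);               /* overloaded form */
    }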
lib/include/arm_sve.h (vendored): 2040 lines changed. File diff suppressed because it is too large.
lib/include/avx512bf16intrin.h (vendored): 33 lines changed
@@ -10,12 +10,14 @@
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

+#ifdef __SSE2__
+
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H

-typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-typedef unsigned short __bfloat16;
+typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
@@ -33,7 +35,7 @@ typedef unsigned short __bfloat16;
///    A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
///    and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
  return __builtin_ia32_cvtsbf162ss_32(__A);
}

@@ -74,9 +76,9 @@ _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
-                                              (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
-                                              (__v32hi)__W);
+  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+                                                (__v32bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -96,9 +98,9 @@ _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
-                                              (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
-                                              (__v32hi)_mm512_setzero_si512());
+  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+                                                (__v32bf)_mm512_setzero_si512());
}

/// Convert Packed Single Data to Packed BF16 Data.
@@ -113,7 +115,7 @@ _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                              (__v16hi)_mm256_undefined_si256(),
+                                              (__v16bf)_mm256_undefined_si256(),
                                              (__mmask16)-1);
}

@@ -134,7 +136,7 @@ _mm512_cvtneps_pbh(__m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                        (__v16hi)__W,
+                                                        (__v16bf)__W,
                                                        (__mmask16)__U);
}

@@ -153,7 +155,7 @@ _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                (__v16hi)_mm256_setzero_si256(),
+                                                (__v16bf)_mm256_setzero_si256(),
                                                (__mmask16)__U);
}

@@ -174,8 +176,8 @@ _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
-                                             (__v16si) __A,
-                                             (__v16si) __B);
+                                             (__v32bf) __A,
+                                             (__v32bf) __B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -277,3 +279,4 @@ _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
#undef __DEFAULT_FN_ATTRS512

#endif
+#endif
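The storage type for bf16 elements moves from a bare unsigned short to the real __bf16 arithmetic type, and __bfloat16 survives only as a deprecated alias, so old code keeps compiling with a warning. A short sketch (requires -mavx512bf16):

    #include <immintrin.h>

    /* Widen one bf16 scalar to float. With the LLVM 16 header the
     * parameter is __bf16, so a raw unsigned short bit pattern no longer
     * converts implicitly; spell the type directly (or keep the
     * deprecated __bfloat16 alias during migration). */
    float widen(__bf16 b) { return _mm_cvtsbh_ss(b); }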
lib/include/avx512fintrin.h (vendored): 4 lines changed
@@ -256,8 +256,8 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
-  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

#define _mm512_setzero _mm512_setzero_ps
lib/include/avx512fp16intrin.h (vendored): 15 lines changed
@@ -10,6 +10,8 @@
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

+#ifdef __SSE2__
+
#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

@@ -17,12 +19,6 @@
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
@@ -829,7 +825,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
-  _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
+  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

@@ -838,13 +834,13 @@ _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

-  return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
+  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
-      (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
+      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
@@ -3347,3 +3343,4 @@ _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
#undef __DEFAULT_FN_ATTRS512

#endif
+#endif
lib/include/avx512ifmavlintrin.h (vendored): 40 lines changed
@@ -18,14 +18,21 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))

+#define _mm_madd52hi_epu64(X, Y, Z)                                            \
+  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
+                                          (__v2di)(Z)))
+
+#define _mm256_madd52hi_epu64(X, Y, Z)                                         \
+  ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y),            \
+                                          (__v4di)(Z)))
+
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
-                                                (__v2di) __Z);
-}
+#define _mm_madd52lo_epu64(X, Y, Z)                                            \
+  ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y),            \
+                                          (__v2di)(Z)))
+
+#define _mm256_madd52lo_epu64(X, Y, Z)                                         \
+  ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y),            \
+                                          (__v4di)(Z)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
@@ -43,13 +50,6 @@ _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
                                                      (__v2di)_mm_setzero_si128());
}

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
@@ -66,13 +66,6 @@ _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z
                                                         (__v4di)_mm256_setzero_si256());
}

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
-                                                (__v2di)__Z);
-}
-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
@@ -89,13 +82,6 @@ _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
                                                      (__v2di)_mm_setzero_si128());
}

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
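The unmasked madd52 entry points become macros over the underlying builtins while the masked forms stay inline functions; call sites do not change. A usage sketch (requires -mavx512ifma -mavx512vl):

    #include <immintrin.h>

    /* 52-bit multiply-accumulate: add the low and high 52-bit halves of
     * x*y into acc. Works the same whether the names are macros (LLVM 16)
     * or functions (older headers). */
    __m128i fma52(__m128i acc, __m128i x, __m128i y) {
      acc = _mm_madd52lo_epu64(acc, x, y);
      return _mm_madd52hi_epu64(acc, x, y);
    }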
lib/include/avx512vlbf16intrin.h (vendored): 69 lines changed
@ -10,11 +10,11 @@
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H

typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
@ -59,9 +59,9 @@ _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)__W);
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                                (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                                (__v8bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -81,9 +81,9 @@ _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)_mm_setzero_si128());
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                                (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                                (__v8bf)_mm_setzero_si128());
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -123,9 +123,9 @@ _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
                                              (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                              (__v16hi)__W);
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                                (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                                (__v16bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
@ -145,9 +145,9 @@ _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
                                              (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                              (__v16hi)_mm256_setzero_si256());
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                                (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                                (__v16bf)_mm256_setzero_si256());
}

/// Convert Packed Single Data to Packed BF16 Data.
@ -160,12 +160,8 @@ _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_pbh(__m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)_mm_undefined_si128(),
                                                        (__mmask8)-1);
}
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
@ -185,7 +181,7 @@ _mm_cvtneps_pbh(__m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)__W,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

@ -205,7 +201,7 @@ _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8hi)_mm_setzero_si128(),
                                                        (__v8bf)_mm_setzero_si128(),
                                                        (__mmask8)__U);
}

@ -218,12 +214,8 @@ _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_pbh(__m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)_mm_undefined_si128(),
                                                        (__mmask8)-1);
}
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
@ -242,7 +234,7 @@ _mm256_cvtneps_pbh(__m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)__W,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

@ -261,7 +253,7 @@ _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8hi)_mm_setzero_si128(),
                                                        (__v8bf)_mm_setzero_si128(),
                                                        (__mmask8)__U);
}

@ -282,8 +274,8 @@ _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
                                             (__v4si)__A,
                                             (__v4si)__B);
                                             (__v8bf)__A,
                                             (__v8bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@ -351,8 +343,8 @@ _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
                                             (__v8si)__A,
                                             (__v8si)__B);
                                             (__v16bf)__A,
                                             (__v16bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@ -413,11 +405,11 @@ _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
///    A float data.
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
///    and fraction field is truncated to 7 bits.
static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
  __v4sf __V = {__A, 0, 0, 0};
  __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
      (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
  return (__bfloat16)__R[0];
  __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
      (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
  return (__bf16)__R[0];
}

/// Convert Packed BF16 Data to Packed float Data.
@ -520,3 +512,4 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS256

#endif
#endif
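Side note on the hunks above (commentary, not part of the vendored header): the bf16 intrinsics move from generic __v8hi/__v16hi integer vectors to dedicated __v8bf/__v16bf bfloat vectors, mask selection switches to the new __builtin_ia32_selectpbf_* builtins, and _mm_cvtneps_pbh/_mm256_cvtneps_pbh become macros over the unmasked vcvtneps2bf16 builtins. A minimal round-trip sketch of the public API, assuming a compiler and flags that enable it (e.g. clang with -mavx512vl -mavx512bf16):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Four floats narrow to bf16; the upper four bf16 lanes are zeroed. */
  __m128 f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128bh b = _mm_cvtneps_pbh(f);

  /* Widen the low four bf16 lanes back to float for inspection. */
  __m128 r = _mm_cvtpbh_ps(b);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}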
352
lib/include/avx512vlbwintrin.h
vendored
@ -2803,6 +2803,358 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
                                      (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
                                      (__v16hi)_mm256_setzero_si256()))

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi16(__m128i __W) {
  return __builtin_reduce_add((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi16(__m128i __W) {
  return __builtin_reduce_mul((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi16(__m128i __W) {
  return __builtin_reduce_and((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi16(__m128i __W) {
  return __builtin_reduce_or((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_add((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
  return __builtin_reduce_mul((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
  return __builtin_reduce_and((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_or((__v8hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi16(__m128i __V) {
  return __builtin_reduce_max((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu16(__m128i __V) {
  return __builtin_reduce_max((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi16(__m128i __V) {
  return __builtin_reduce_min((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu16(__m128i __V) {
  return __builtin_reduce_min((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
  return __builtin_reduce_max((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
  __V = _mm_maskz_mov_epi16(__M, __V);
  return __builtin_reduce_max((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
  return __builtin_reduce_min((__v8hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
  return __builtin_reduce_min((__v8hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi16(__m256i __W) {
  return __builtin_reduce_add((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi16(__m256i __W) {
  return __builtin_reduce_mul((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi16(__m256i __W) {
  return __builtin_reduce_and((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi16(__m256i __W) {
  return __builtin_reduce_or((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_add((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
  return __builtin_reduce_mul((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
  return __builtin_reduce_and((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi16(__M, __W);
  return __builtin_reduce_or((__v16hi)__W);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi16(__m256i __V) {
  return __builtin_reduce_max((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu16(__m256i __V) {
  return __builtin_reduce_max((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi16(__m256i __V) {
  return __builtin_reduce_min((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu16(__m256i __V) {
  return __builtin_reduce_min((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
  return __builtin_reduce_max((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
  __V = _mm256_maskz_mov_epi16(__M, __V);
  return __builtin_reduce_max((__v16hu)__V);
}

static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
  return __builtin_reduce_min((__v16hi)__V);
}

static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
  return __builtin_reduce_min((__v16hu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi8(__m128i __W) {
  return __builtin_reduce_add((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi8(__m128i __W) {
  return __builtin_reduce_mul((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi8(__m128i __W) {
  return __builtin_reduce_and((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi8(__m128i __W) {
  return __builtin_reduce_or((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_add((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
  return __builtin_reduce_mul((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
  return __builtin_reduce_and((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
  __W = _mm_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_or((__v16qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi8(__m128i __V) {
  return __builtin_reduce_max((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu8(__m128i __V) {
  return __builtin_reduce_max((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi8(__m128i __V) {
  return __builtin_reduce_min((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu8(__m128i __V) {
  return __builtin_reduce_min((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
  return __builtin_reduce_max((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
  __V = _mm_maskz_mov_epi8(__M, __V);
  return __builtin_reduce_max((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
  return __builtin_reduce_min((__v16qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
  return __builtin_reduce_min((__v16qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi8(__m256i __W) {
  return __builtin_reduce_add((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi8(__m256i __W) {
  return __builtin_reduce_mul((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi8(__m256i __W) {
  return __builtin_reduce_and((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi8(__m256i __W) {
  return __builtin_reduce_or((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_add((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
  return __builtin_reduce_mul((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
  return __builtin_reduce_and((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
  __W = _mm256_maskz_mov_epi8(__M, __W);
  return __builtin_reduce_or((__v32qs)__W);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi8(__m256i __V) {
  return __builtin_reduce_max((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu8(__m256i __V) {
  return __builtin_reduce_max((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi8(__m256i __V) {
  return __builtin_reduce_min((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu8(__m256i __V) {
  return __builtin_reduce_min((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
  return __builtin_reduce_max((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
  __V = _mm256_maskz_mov_epi8(__M, __V);
  return __builtin_reduce_max((__v32qu)__V);
}

static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
  return __builtin_reduce_min((__v32qs)__V);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
  return __builtin_reduce_min((__v32qu)__V);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
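The masked reductions added above all follow one pattern: masked-off lanes are first overwritten with the identity element of the operation (0 for add/or, 1 for mul, all-ones for and, the extreme signed/unsigned value for max/min), after which a plain __builtin_reduce_* over every lane yields the masked result. A small sketch of the observable behavior, with hypothetical values, assuming clang with -mavx512vl -mavx512bw:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Lanes 0..7 hold 1..8 (_mm_set_epi16 lists lane 7 first). */
  __m128i v = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);

  /* Only lanes 0 and 1 participate: 1 + 2 == 3. */
  short s = _mm_mask_reduce_add_epi16((__mmask8)0x03, v);

  /* Masked mul relies on the identity 1: lanes 2 and 3 give 3 * 4 == 12. */
  short p = _mm_mask_reduce_mul_epi16((__mmask8)0x0C, v);

  printf("%d %d\n", s, p); /* expected: 3 12 */
  return 0;
}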
3
lib/include/avx512vlfp16intrin.h
vendored
@ -11,6 +11,8 @@
    "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H

@ -2066,3 +2068,4 @@ _mm_reduce_min_ph(__m128h __V) {
#undef __DEFAULT_FN_ATTRS256

#endif
#endif
177
lib/include/avxifmaintrin.h
vendored
Normal file
@ -0,0 +1,177 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(256)))

// These intrinsics must use VEX encoding.

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
///    return __m128i dst.
/// \param __X
///    A 128-bit vector of [2 x i64]
/// \param __Y
///    A 128-bit vector of [2 x i64]
/// \param __Z
///    A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
///    return __m256i dst.
/// \param __X
///    A 256-bit vector of [4 x i64]
/// \param __Y
///    A 256-bit vector of [4 x i64]
/// \param __Z
///    A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
///    return __m128i dst.
/// \param __X
///    A 128-bit vector of [2 x i64]
/// \param __Y
///    A 128-bit vector of [2 x i64]
/// \param __Z
///    A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
///    return __m256i dst.
/// \param __X
///    A 256-bit vector of [4 x i64]
/// \param __Y
///    A 256-bit vector of [4 x i64]
/// \param __Z
///    A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXIFMAINTRIN_H
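The madd52 intrinsics are aimed at multi-precision arithmetic on 52-bit limbs: the 104-bit product of two limbs is split into its low and high 52-bit halves, and each half is accumulated into a separate 64-bit lane. A sketch with arbitrary values, assuming a compiler that accepts -mavxifma (clang 16 or newer):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Two 52-bit limbs per lane and a zeroed accumulator. */
  __m128i a   = _mm_set1_epi64x((1LL << 51) + 5);
  __m128i b   = _mm_set1_epi64x((1LL << 50) + 7);
  __m128i acc = _mm_setzero_si128();

  /* Accumulate the low and high 52-bit halves of a*b separately. */
  __m128i lo = _mm_madd52lo_avx_epu64(acc, a, b);
  __m128i hi = _mm_madd52hi_avx_epu64(acc, a, b);

  printf("lo=%lld hi=%lld\n", (long long)_mm_cvtsi128_si64(lo),
         (long long)_mm_cvtsi128_si64(hi));
  return 0;
}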
14
lib/include/avxintrin.h
vendored
@ -39,6 +39,16 @@ typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
@ -4288,7 +4298,7 @@ _mm256_set1_epi64x(long long __q)
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
  return __extension__ (__m256d){ 0, 0, 0, 0 };
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

/// Constructs a 256-bit floating-point vector of [8 x float] with all
@ -4302,7 +4312,7 @@ _mm256_setzero_pd(void)
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

/// Constructs a 256-bit integer vector initialized to zero.
484
lib/include/avxneconvertintrin.h
vendored
Normal file
@ -0,0 +1,484 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(256)))

/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}

/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}

/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}

/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__
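The even/odd converters above pair naturally: a buffer of interleaved 16-bit bf16 or fp16 values widens into two float vectors, one per parity, without any shuffling. A sketch with made-up data, assuming clang with -mavxneconvert:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Eight bf16 lanes: 1..4 in the low half, zeros in the high half. */
  __m128bh src = _mm_cvtneps_avx_pbh(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));

  /* Even lanes (0,2,4,6) and odd lanes (1,3,5,7) as floats. */
  __m128 even = _mm_cvtneebf16_ps(&src);
  __m128 odd  = _mm_cvtneobf16_ps(&src);

  float e[4], o[4];
  _mm_storeu_ps(e, even);
  _mm_storeu_ps(o, odd);
  printf("%g %g\n", e[0], o[0]); /* expected: 1 2 */
  return 0;
}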
471
lib/include/avxvnniint8intrin.h
vendored
Normal file
@ -0,0 +1,471 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
                 __min_vector_width__(128)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \returns
|
||||
/// A 128-bit vector of [4 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \returns
|
||||
/// A 256-bit vector of [8 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W with signed saturation, and store the packed
|
||||
/// 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [16 x unsigned char].
|
||||
/// \returns
|
||||
/// A 128-bit vector of [4 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W with signed saturation, and store the packed
|
||||
/// 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \param __B
|
||||
/// A 256-bit vector of [32 x unsigned char].
|
||||
/// \returns
|
||||
/// A 256-bit vector of [8 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
|
||||
#endif // __AVXVNNIINT8INTRIN_H
|
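As a quick orientation for readers of this header, here is a minimal usage sketch for the dot-product family above. It is not part of the upstream diff; it assumes a toolchain built with -mavxvnniint8 and a CPU that reports AVX-VNNI-INT8, and the input values are purely illustrative.

// Accumulate per-lane dot products of unsigned bytes with _mm_dpbuud_epi32.
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char a[16], b[16];
  for (int i = 0; i < 16; ++i) {
    a[i] = (unsigned char)i; // sample data
    b[i] = 2;
  }

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);

  // Lane j of the result is the sum over k of a[4j+k] * b[4j+k], added to 0.
  __m128i acc = _mm_dpbuud_epi32(_mm_setzero_si128(), va, vb);

  int out[4];
  _mm_storeu_si128((__m128i *)out, acc);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 12 44 76 108
  return 0;
}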
70
lib/include/cmpccxaddintrin.h
vendored
Normal file
@ -0,0 +1,70 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __X86GPRINTRIN_H
#error \
    "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__

typedef enum {
  _CMPCCX_O,   /* Overflow. */
  _CMPCCX_NO,  /* No overflow. */
  _CMPCCX_B,   /* Below. */
  _CMPCCX_NB,  /* Not below. */
  _CMPCCX_Z,   /* Zero. */
  _CMPCCX_NZ,  /* Not zero. */
  _CMPCCX_BE,  /* Below or equal. */
  _CMPCCX_NBE, /* Neither below nor equal. */
  _CMPCCX_S,   /* Sign. */
  _CMPCCX_NS,  /* No sign. */
  _CMPCCX_P,   /* Parity. */
  _CMPCCX_NP,  /* No parity. */
  _CMPCCX_L,   /* Less. */
  _CMPCCX_NL,  /* Not less. */
  _CMPCCX_LE,  /* Less or equal. */
  _CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;

/// Compares the value at the memory location __A with the value of __B. If
/// the specified condition __D is met, adds the third operand __C to the
/// value at __A and writes the result back to __A; otherwise the value at
/// __A is left unchanged. The return value is the original value at __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
///    A pointer specifying the memory address.
///
/// \param __B
///    An integer operand.
///
/// \param __C
///    An integer operand.
///
/// \param __D
///    The specified condition.
///
/// \returns An integer which is the original value of the first operand.

#define _cmpccxadd_epi32(__A, __B, __C, __D)                                   \
  ((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C),     \
                                    (int)(__D))))

#define _cmpccxadd_epi64(__A, __B, __C, __D)                                   \
  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B),     \
                                          (long long)(__C), (int)(__D))))

#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H
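A hedged usage sketch for the macros above (not from the header itself; it assumes clang -mcmpccxadd on an x86-64 CPU with CMPCCXADD): bump a counter only while it is still below a limit.

#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  int counter = 5;
  // CMPCCXADD compares *(&counter) with 10; since 5 < 10 (_CMPCCX_L holds),
  // it adds 1 to counter. Either way it returns the original value.
  int old = _cmpccxadd_epi32(&counter, 10, 1, _CMPCCX_L);
  printf("old=%d new=%d\n", old, counter); // old=5 new=6
  return 0;
}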
12
lib/include/cpuid.h
vendored
@ -200,9 +200,18 @@
#define bit_AMXINT8 0x02000000

/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_RAOINT 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_CMPCCXADD 0x00000080
#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
#define bit_AVXIFMA 0x00800000

/* Features in %edx for leaf 7 sub-leaf 1 */
#define bit_AVXVNNIINT8 0x00000010
#define bit_AVXNECONVERT 0x00000020
#define bit_PREFETCHI 0x00004000

/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
@ -261,7 +270,8 @@
    : "0"(__leaf), "2"(__count))
#endif

static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
                                              unsigned int *__sig)
{
  unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
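The new sub-leaf bits above pair with the query helpers in the same header. A sketch (not part of the diff) of probing leaf 7 sub-leaf 1 for the AVX-VNNI-INT8 bit:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  // __get_cpuid_count returns 0 when leaf 7 is unsupported, so the bit
  // test below is safe on older CPUs.
  if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) &&
      (edx & bit_AVXVNNIINT8))
    puts("AVX-VNNI-INT8 available");
  return 0;
}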
90
lib/include/cuda_wrappers/cmath
vendored
Normal file
@ -0,0 +1,90 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH

#include_next <cmath>

#if defined(_LIBCPP_STD_VER)

// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.

__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);

namespace std {

// For __constexpr_fmin/fmax we only need device-side overloads before C++14,
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
  return __builtin_fmaxf(__x, __y);
}

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
  return __builtin_fmax(__x, __y);
}

__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
  return __builtin_fmaxl(__x, __y);
}

template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
  using __result_type = typename __promote<_Tp, _Up>::type;
  return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14

// For the logb/scalbn templates we must always provide device overloads,
// because the libc++ implementation uses __builtin_XXX, which gets translated
// into a libcall that we can't handle on the GPU. We need to forward those to
// the CUDA-provided implementations.

template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
  return ::logb(__x);
}

template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
  return ::scalbn(__x, __exp);
}

} // namespace std

#endif // _LIBCPP_STD_VER

#endif // include guard
12
lib/include/emmintrin.h
vendored
@ -38,6 +38,16 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
@ -1809,7 +1819,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
///    all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
  return __extension__(__m128d){0, 0};
  return __extension__(__m128d){0.0, 0.0};
}

/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
27
lib/include/float.h
vendored
@ -38,9 +38,10 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
     __cplusplus >= 201103L || \
     (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@ -67,9 +68,10 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
     __cplusplus >= 201703L || \
     (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201703L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@ -84,7 +86,10 @@

/* Characteristics of floating point types, C99 5.2.4.2.2 */

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
#endif
#define FLT_ROUNDS (__builtin_flt_rounds())
#define FLT_RADIX __FLT_RADIX__

@ -92,8 +97,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__

#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
    __cplusplus >= 201103L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@ -130,8 +136,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__

#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
    __cplusplus >= 201703L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    !defined(__STRICT_ANSI__) || \
    (defined(__cplusplus) && __cplusplus >= 201703L) || \
    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
12
lib/include/gfniintrin.h
vendored
@ -20,10 +20,12 @@
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))

/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))

/* Default attributes for VLX forms. */
/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))

@ -99,7 +101,7 @@ _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
                                              (__v64qi) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectb_512(__U,
@ -107,7 +109,7 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
                                              (__v64qi) __S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
15
lib/include/hlsl.h
vendored
@ -1,15 +0,0 @@
//===----- hlsl.h - HLSL definitions --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_H_
#define _HLSL_H_

#include "hlsl/hlsl_basic_types.h"
#include "hlsl/hlsl_intrinsics.h"

#endif //_HLSL_H_
64
lib/include/hlsl_basic_types.h
vendored
@ -1,64 +0,0 @@
//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_HLSL_BASIC_TYPES_H_
#define _HLSL_HLSL_BASIC_TYPES_H_

// built-in scalar data types:

#ifdef __HLSL_ENABLE_16_BIT
// 16-bit integer.
typedef unsigned short uint16_t;
typedef short int16_t;
#endif

// unsigned 32-bit integer.
typedef unsigned int uint;

// 64-bit integer.
typedef unsigned long uint64_t;
typedef long int64_t;

// built-in vector data types:

#ifdef __HLSL_ENABLE_16_BIT
typedef vector<int16_t, 2> int16_t2;
typedef vector<int16_t, 3> int16_t3;
typedef vector<int16_t, 4> int16_t4;
typedef vector<uint16_t, 2> uint16_t2;
typedef vector<uint16_t, 3> uint16_t3;
typedef vector<uint16_t, 4> uint16_t4;
#endif

typedef vector<int, 2> int2;
typedef vector<int, 3> int3;
typedef vector<int, 4> int4;
typedef vector<uint, 2> uint2;
typedef vector<uint, 3> uint3;
typedef vector<uint, 4> uint4;
typedef vector<int64_t, 2> int64_t2;
typedef vector<int64_t, 3> int64_t3;
typedef vector<int64_t, 4> int64_t4;
typedef vector<uint64_t, 2> uint64_t2;
typedef vector<uint64_t, 3> uint64_t3;
typedef vector<uint64_t, 4> uint64_t4;

#ifdef __HLSL_ENABLE_16_BIT
typedef vector<half, 2> half2;
typedef vector<half, 3> half3;
typedef vector<half, 4> half4;
#endif

typedef vector<float, 2> float2;
typedef vector<float, 3> float3;
typedef vector<float, 4> float4;
typedef vector<double, 2> double2;
typedef vector<double, 3> double3;
typedef vector<double, 4> double4;

#endif //_HLSL_HLSL_BASIC_TYPES_H_
15
lib/include/hlsl_intrinsics.h
vendored
@ -1,15 +0,0 @@
//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _HLSL_HLSL_INTRINSICS_H_
#define _HLSL_HLSL_INTRINSICS_H_

__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
WaveActiveCountBits(bool bBit);

#endif //_HLSL_HLSL_INTRINSICS_H_
48
lib/include/immintrin.h
vendored
@ -189,6 +189,11 @@
#include <avx512ifmavlintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXIFMA__)
#include <avxifmaintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
@ -214,17 +219,13 @@
#include <avx512pfintrin.h>
#endif

/*
 * FIXME: The _Float16 type is legal only when the HW supports float16
 * operations. We use __AVX512FP16__ to identify whether float16 is supported,
 * so when float16 is not supported, the related header is not included.
 */
#if defined(__AVX512FP16__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif

#if defined(__AVX512FP16__) && defined(__AVX512VL__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512FP16__))
#include <avx512vlfp16intrin.h>
#endif

@ -258,6 +259,16 @@
#include <gfniintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXVNNIINT8__)
#include <avxvnniint8intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVXNECONVERT__)
#include <avxneconvertintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
@ -291,6 +302,23 @@ _rdrand64_step(unsigned long long *__p)
{
  return (int)__builtin_ia32_rdrand64_step(__p);
}
#else
// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
// rdrand instructions.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
  unsigned int __lo, __hi;
  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
  if (__res_lo && __res_hi) {
    *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
    return 1;
  } else {
    *__p = 0;
    return 0;
  }
}
#endif
#endif /* __RDRND__ */

@ -495,6 +523,10 @@ _storebe_i64(void * __P, long long __D) {
    defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AMXFP16__)
#include <amxfp16intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__KL__) || defined(__WIDEKL__)
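Both the native and the newly added emulated 64-bit path above return 0 on failure, so callers should retry. A short sketch (not from the header; assumes clang -mrdrnd):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long r = 0;
  // RDRAND can transiently fail; bound the retries instead of looping forever.
  for (int tries = 0; tries < 10; ++tries) {
    if (_rdrand64_step(&r)) {
      printf("random: %llu\n", r);
      return 0;
    }
  }
  fprintf(stderr, "hardware RNG unavailable\n");
  return 1;
}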
234
lib/include/larchintrin.h
vendored
Normal file
@ -0,0 +1,234 @@
/*===------------ larchintrin.h - LoongArch intrinsics ---------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _LOONGARCH_BASE_INTRIN_H
#define _LOONGARCH_BASE_INTRIN_H

#ifdef __cplusplus
extern "C" {
#endif

typedef struct rdtime {
  unsigned int value;
  unsigned int timeid;
} __rdtime_t;

#if __loongarch_grlen == 64
typedef struct drdtime {
  unsigned long dvalue;
  unsigned long dtimeid;
} __drdtime_t;

extern __inline __drdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtime_d(void) {
  __drdtime_t __drdtime;
  __asm__ volatile(
      "rdtime.d %[val], %[tid]\n\t"
      : [val] "=&r"(__drdtime.dvalue), [tid] "=&r"(__drdtime.dtimeid));
  return __drdtime;
}
#endif

extern __inline __rdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtimeh_w(void) {
  __rdtime_t __rdtime;
  __asm__ volatile("rdtimeh.w %[val], %[tid]\n\t"
                   : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
  return __rdtime;
}

extern __inline __rdtime_t
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __rdtimel_w(void) {
  __rdtime_t __rdtime;
  __asm__ volatile("rdtimel.w %[val], %[tid]\n\t"
                   : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
  return __rdtime;
}

#if __loongarch_grlen == 64
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_b_w(char _1, int _2) {
  return (int)__builtin_loongarch_crc_w_b_w((char)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_h_w(short _1, int _2) {
  return (int)__builtin_loongarch_crc_w_h_w((short)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_w_w(int _1, int _2) {
  return (int)__builtin_loongarch_crc_w_w_w((int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crc_w_d_w(long int _1, int _2) {
  return (int)__builtin_loongarch_crc_w_d_w((long int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_b_w(char _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_b_w((char)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_h_w(short _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_h_w((short)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_w_w(int _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_w_w((int)_1, (int)_2);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __crcc_w_d_w(long int _1, int _2) {
  return (int)__builtin_loongarch_crcc_w_d_w((long int)_1, (int)_2);
}
#endif

#define __break(/*ui15*/ _1) __builtin_loongarch_break((_1))

#if __loongarch_grlen == 32
#define __cacop_w(/*uimm5*/ _1, /*unsigned int*/ _2, /*simm12*/ _3) \
  ((void)__builtin_loongarch_cacop_w((_1), (unsigned int)(_2), (_3)))
#endif

#if __loongarch_grlen == 64
#define __cacop_d(/*uimm5*/ _1, /*unsigned long int*/ _2, /*simm12*/ _3) \
  ((void)__builtin_loongarch_cacop_d((_1), (unsigned long int)(_2), (_3)))
#endif

#define __dbar(/*ui15*/ _1) __builtin_loongarch_dbar((_1))

#define __ibar(/*ui15*/ _1) __builtin_loongarch_ibar((_1))

#define __movfcsr2gr(/*ui5*/ _1) __builtin_loongarch_movfcsr2gr((_1))

#define __movgr2fcsr(/*ui5*/ _1, _2) \
  __builtin_loongarch_movgr2fcsr((_1), (unsigned int)_2)

#define __syscall(/*ui15*/ _1) __builtin_loongarch_syscall((_1))

#define __csrrd_w(/*ui14*/ _1) ((unsigned int)__builtin_loongarch_csrrd_w((_1)))

#define __csrwr_w(/*unsigned int*/ _1, /*ui14*/ _2) \
  ((unsigned int)__builtin_loongarch_csrwr_w((unsigned int)(_1), (_2)))

#define __csrxchg_w(/*unsigned int*/ _1, /*unsigned int*/ _2, /*ui14*/ _3) \
  ((unsigned int)__builtin_loongarch_csrxchg_w((unsigned int)(_1), \
                                               (unsigned int)(_2), (_3)))

#if __loongarch_grlen == 64
#define __csrrd_d(/*ui14*/ _1) \
  ((unsigned long int)__builtin_loongarch_csrrd_d((_1)))

#define __csrwr_d(/*unsigned long int*/ _1, /*ui14*/ _2) \
  ((unsigned long int)__builtin_loongarch_csrwr_d((unsigned long int)(_1), \
                                                  (_2)))

#define __csrxchg_d(/*unsigned long int*/ _1, /*unsigned long int*/ _2, \
                    /*ui14*/ _3) \
  ((unsigned long int)__builtin_loongarch_csrxchg_d( \
      (unsigned long int)(_1), (unsigned long int)(_2), (_3)))
#endif

extern __inline unsigned char
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_b(unsigned int _1) {
  return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
}

extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_h(unsigned int _1) {
  return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_w(unsigned int _1) {
  return (unsigned int)__builtin_loongarch_iocsrrd_w((unsigned int)_1);
}

#if __loongarch_grlen == 64
extern __inline unsigned long int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_d(unsigned int _1) {
  return (unsigned long int)__builtin_loongarch_iocsrrd_d((unsigned int)_1);
}
#endif

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_b(unsigned char _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_b((unsigned char)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_h(unsigned short _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_h((unsigned short)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_w(unsigned int _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_w((unsigned int)_1, (unsigned int)_2);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __cpucfg(unsigned int _1) {
  return (unsigned int)__builtin_loongarch_cpucfg((unsigned int)_1);
}

#if __loongarch_grlen == 64
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrwr_d(unsigned long int _1, unsigned int _2) {
  __builtin_loongarch_iocsrwr_d((unsigned long int)_1, (unsigned int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __asrtgt_d(long int _1, long int _2) {
  __builtin_loongarch_asrtgt_d((long int)_1, (long int)_2);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __asrtle_d(long int _1, long int _2) {
  __builtin_loongarch_asrtle_d((long int)_1, (long int)_2);
}
#endif

#if __loongarch_grlen == 64
#define __lddir_d(/*long int*/ _1, /*ui5*/ _2) \
  ((long int)__builtin_loongarch_lddir_d((long int)(_1), (_2)))

#define __ldpte_d(/*long int*/ _1, /*ui5*/ _2) \
  ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
#endif

#ifdef __cplusplus
}
#endif
#endif /* _LOONGARCH_BASE_INTRIN_H */
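A hedged example of the timer and CRC helpers above (not part of the header; assumes a 64-bit LoongArch target, i.e. __loongarch_grlen == 64):

#include <larchintrin.h>
#include <stdio.h>

int main(void) {
  __drdtime_t t = __rdtime_d(); // stable counter value plus counter ID
  // Fold the low 32 bits of the counter into a CRC32 with seed 0.
  int crc = __crc_w_w_w((int)t.dvalue, 0);
  printf("time=%lu id=%lu crc=%08x\n", t.dvalue, t.dtimeid, (unsigned)crc);
  return 0;
}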
5
lib/include/limits.h
vendored
@ -65,7 +65,7 @@
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
@ -93,7 +93,8 @@
/* C99 5.2.4.2.1: Added long long.
   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
 */
#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)

#undef LLONG_MIN
#undef LLONG_MAX
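The guard rewrite repeated across float.h and limits.h in this commit has one motivation: in C++ (or pre-C99 C) __STDC_VERSION__ is not defined, and evaluating an undefined macro inside #if trips -Wundef. A minimal illustration with a hypothetical project-local flag:

/* Testing defined() first keeps -Wundef quiet when the macro is absent. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define MY_HAVE_C99 1 /* hypothetical flag, not from the headers */
#else
#define MY_HAVE_C99 0
#endif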
19
lib/include/opencl-c-base.h
vendored
@ -74,6 +74,25 @@
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)

// Undefine any feature macros that have been explicitly disabled using
// an __undef_<feature> macro.
#ifdef __undef___opencl_c_work_group_collective_functions
#undef __opencl_c_work_group_collective_functions
#endif
#ifdef __undef___opencl_c_atomic_order_seq_cst
#undef __opencl_c_atomic_order_seq_cst
#endif
#ifdef __undef___opencl_c_atomic_scope_device
#undef __opencl_c_atomic_scope_device
#endif
#ifdef __undef___opencl_c_atomic_scope_all_devices
#undef __opencl_c_atomic_scope_all_devices
#endif
#ifdef __undef___opencl_c_read_write_images
#undef __opencl_c_read_write_images
#endif

#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)

#if !defined(__opencl_c_generic_address_space)
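A sketch of how kernel code consumes one of these feature macros after the __undef_ pass above (OpenCL C, not part of the header; the kernel body is illustrative only):

// Compile this path only when read-write images survived the feature pass.
#if defined(__opencl_c_read_write_images)
kernel void dim_image(read_write image2d_t img) {
  int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));
  float4 px = read_imagef(img, pos);
  write_imagef(img, pos, px * 0.5f); // halve each pixel in place
}
#endif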
320
lib/include/opencl-c.h
vendored
@ -12396,11 +12396,11 @@ void __ovld vstorea_half16_rtn(double16, size_t, __private half *);
 * image objects and then want to read the updated data.
 */
void __ovld __conv barrier(cl_mem_fence_flags flags);
void __ovld __conv barrier(cl_mem_fence_flags);

#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv work_group_barrier(cl_mem_fence_flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@ -12418,7 +12418,7 @@ void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld mem_fence(cl_mem_fence_flags flags);
void __ovld mem_fence(cl_mem_fence_flags);

/**
 * Read memory barrier that orders only
@ -12430,7 +12430,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld read_mem_fence(cl_mem_fence_flags flags);
void __ovld read_mem_fence(cl_mem_fence_flags);

/**
 * Write memory barrier that orders only
@ -12442,7 +12442,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
 * CLK_LOCAL_MEM_FENCE
 * CLK_GLOBAL_MEM_FENCE.
 */
void __ovld write_mem_fence(cl_mem_fence_flags flags);
void __ovld write_mem_fence(cl_mem_fence_flags);

// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions

@ -12891,29 +12891,29 @@ void __ovld prefetch(const __global half16 *, size_t);
 * (old + val) and store result at location
 * pointed by p. The function returns old.
 */
int __ovld atomic_add(volatile __global int *p, int val);
uint __ovld atomic_add(volatile __global uint *p, uint val);
int __ovld atomic_add(volatile __local int *p, int val);
uint __ovld atomic_add(volatile __local uint *p, uint val);
int __ovld atomic_add(volatile __global int *, int);
uint __ovld atomic_add(volatile __global uint *, uint);
int __ovld atomic_add(volatile __local int *, int);
uint __ovld atomic_add(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_add(volatile int *p, int val);
uint __ovld atomic_add(volatile uint *p, uint val);
int __ovld atomic_add(volatile int *, int);
uint __ovld atomic_add(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_add(volatile __global int *p, int val);
uint __ovld atom_add(volatile __global uint *p, uint val);
int __ovld atom_add(volatile __global int *, int);
uint __ovld atom_add(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_add(volatile __local int *p, int val);
uint __ovld atom_add(volatile __local uint *p, uint val);
int __ovld atom_add(volatile __local int *, int);
uint __ovld atom_add(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_add(volatile __global long *p, long val);
ulong __ovld atom_add(volatile __global ulong *p, ulong val);
long __ovld atom_add(volatile __local long *p, long val);
ulong __ovld atom_add(volatile __local ulong *p, ulong val);
long __ovld atom_add(volatile __global long *, long);
ulong __ovld atom_add(volatile __global ulong *, ulong);
long __ovld atom_add(volatile __local long *, long);
ulong __ovld atom_add(volatile __local ulong *, ulong);
#endif

/**
@ -12921,29 +12921,29 @@ ulong __ovld atom_add(volatile __local ulong *p, ulong val);
 * Compute (old - val) and store result at location pointed by p. The function
 * returns old.
 */
int __ovld atomic_sub(volatile __global int *p, int val);
uint __ovld atomic_sub(volatile __global uint *p, uint val);
int __ovld atomic_sub(volatile __local int *p, int val);
uint __ovld atomic_sub(volatile __local uint *p, uint val);
int __ovld atomic_sub(volatile __global int *, int);
uint __ovld atomic_sub(volatile __global uint *, uint);
int __ovld atomic_sub(volatile __local int *, int);
uint __ovld atomic_sub(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_sub(volatile int *p, int val);
uint __ovld atomic_sub(volatile uint *p, uint val);
int __ovld atomic_sub(volatile int *, int);
uint __ovld atomic_sub(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_sub(volatile __global int *p, int val);
uint __ovld atom_sub(volatile __global uint *p, uint val);
int __ovld atom_sub(volatile __global int *, int);
uint __ovld atom_sub(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_sub(volatile __local int *p, int val);
uint __ovld atom_sub(volatile __local uint *p, uint val);
int __ovld atom_sub(volatile __local int *, int);
uint __ovld atom_sub(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_sub(volatile __global long *p, long val);
ulong __ovld atom_sub(volatile __global ulong *p, ulong val);
long __ovld atom_sub(volatile __local long *p, long val);
ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
long __ovld atom_sub(volatile __global long *, long);
ulong __ovld atom_sub(volatile __global ulong *, ulong);
long __ovld atom_sub(volatile __local long *, long);
ulong __ovld atom_sub(volatile __local ulong *, ulong);
#endif

/**
@ -12951,32 +12951,32 @@ ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
 * with new value given by val. Returns old
 * value.
 */
int __ovld atomic_xchg(volatile __global int *p, int val);
uint __ovld atomic_xchg(volatile __global uint *p, uint val);
int __ovld atomic_xchg(volatile __local int *p, int val);
uint __ovld atomic_xchg(volatile __local uint *p, uint val);
float __ovld atomic_xchg(volatile __global float *p, float val);
float __ovld atomic_xchg(volatile __local float *p, float val);
int __ovld atomic_xchg(volatile __global int *, int);
uint __ovld atomic_xchg(volatile __global uint *, uint);
int __ovld atomic_xchg(volatile __local int *, int);
uint __ovld atomic_xchg(volatile __local uint *, uint);
float __ovld atomic_xchg(volatile __global float *, float);
float __ovld atomic_xchg(volatile __local float *, float);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xchg(volatile int *p, int val);
uint __ovld atomic_xchg(volatile uint *p, uint val);
float __ovld atomic_xchg(volatile float *p, float val);
int __ovld atomic_xchg(volatile int *, int);
uint __ovld atomic_xchg(volatile uint *, uint);
float __ovld atomic_xchg(volatile float *, float);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
uint __ovld atom_xchg(volatile __global uint *p, uint val);
int __ovld atom_xchg(volatile __global int *, int);
uint __ovld atom_xchg(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_xchg(volatile __local int *p, int val);
uint __ovld atom_xchg(volatile __local uint *p, uint val);
int __ovld atom_xchg(volatile __local int *, int);
uint __ovld atom_xchg(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_xchg(volatile __global long *p, long val);
long __ovld atom_xchg(volatile __local long *p, long val);
ulong __ovld atom_xchg(volatile __global ulong *p, ulong val);
ulong __ovld atom_xchg(volatile __local ulong *p, ulong val);
long __ovld atom_xchg(volatile __global long *, long);
long __ovld atom_xchg(volatile __local long *, long);
ulong __ovld atom_xchg(volatile __global ulong *, ulong);
ulong __ovld atom_xchg(volatile __local ulong *, ulong);
#endif

/**
@ -13048,29 +13048,29 @@ ulong __ovld atom_dec(volatile __local ulong *);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __global int *, int, int);
uint __ovld atomic_cmpxchg(volatile __global uint *, uint, uint);
int __ovld atomic_cmpxchg(volatile __local int *, int, int);
uint __ovld atomic_cmpxchg(volatile __local uint *, uint, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile int *, int, int);
uint __ovld atomic_cmpxchg(volatile uint *, uint, uint);
#endif

#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __global int *, int, int);
uint __ovld atom_cmpxchg(volatile __global uint *, uint, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __local int *, int, int);
uint __ovld atom_cmpxchg(volatile __local uint *, uint, uint);
#endif

#if defined(cl_khr_int64_base_atomics)
long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __global long *, long, long);
ulong __ovld atom_cmpxchg(volatile __global ulong *, ulong, ulong);
long __ovld atom_cmpxchg(volatile __local long *, long, long);
ulong __ovld atom_cmpxchg(volatile __local ulong *, ulong, ulong);
#endif

/**
@ -13080,29 +13080,29 @@ ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_min(volatile __global int *p, int val);
uint __ovld atomic_min(volatile __global uint *p, uint val);
int __ovld atomic_min(volatile __local int *p, int val);
uint __ovld atomic_min(volatile __local uint *p, uint val);
int __ovld atomic_min(volatile __global int *, int);
uint __ovld atomic_min(volatile __global uint *, uint);
int __ovld atomic_min(volatile __local int *, int);
uint __ovld atomic_min(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_min(volatile int *p, int val);
uint __ovld atomic_min(volatile uint *p, uint val);
int __ovld atomic_min(volatile int *, int);
uint __ovld atomic_min(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_min(volatile __global int *p, int val);
uint __ovld atom_min(volatile __global uint *p, uint val);
int __ovld atom_min(volatile __global int *, int);
uint __ovld atom_min(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_min(volatile __local int *p, int val);
uint __ovld atom_min(volatile __local uint *p, uint val);
int __ovld atom_min(volatile __local int *, int);
uint __ovld atom_min(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_min(volatile __global long *p, long val);
ulong __ovld atom_min(volatile __global ulong *p, ulong val);
long __ovld atom_min(volatile __local long *p, long val);
ulong __ovld atom_min(volatile __local ulong *p, ulong val);
long __ovld atom_min(volatile __global long *, long);
ulong __ovld atom_min(volatile __global ulong *, ulong);
long __ovld atom_min(volatile __local long *, long);
ulong __ovld atom_min(volatile __local ulong *, ulong);
#endif

/**
@ -13112,29 +13112,29 @@ ulong __ovld atom_min(volatile __local ulong *p, ulong val);
 * location pointed by p. The function
 * returns old.
 */
int __ovld atomic_max(volatile __global int *p, int val);
uint __ovld atomic_max(volatile __global uint *p, uint val);
int __ovld atomic_max(volatile __local int *p, int val);
uint __ovld atomic_max(volatile __local uint *p, uint val);
int __ovld atomic_max(volatile __global int *, int);
uint __ovld atomic_max(volatile __global uint *, uint);
int __ovld atomic_max(volatile __local int *, int);
uint __ovld atomic_max(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_max(volatile int *p, int val);
uint __ovld atomic_max(volatile uint *p, uint val);
int __ovld atomic_max(volatile int *, int);
uint __ovld atomic_max(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_max(volatile __global int *p, int val);
uint __ovld atom_max(volatile __global uint *p, uint val);
int __ovld atom_max(volatile __global int *, int);
uint __ovld atom_max(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_max(volatile __local int *p, int val);
uint __ovld atom_max(volatile __local uint *p, uint val);
int __ovld atom_max(volatile __local int *, int);
uint __ovld atom_max(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_max(volatile __global long *p, long val);
ulong __ovld atom_max(volatile __global ulong *p, ulong val);
long __ovld atom_max(volatile __local long *p, long val);
ulong __ovld atom_max(volatile __local ulong *p, ulong val);
long __ovld atom_max(volatile __global long *, long);
ulong __ovld atom_max(volatile __global ulong *, ulong);
long __ovld atom_max(volatile __local long *, long);
ulong __ovld atom_max(volatile __local ulong *, ulong);
#endif

/**
@ -13143,29 +13143,29 @@ ulong __ovld atom_max(volatile __local ulong *p, ulong val);
 * (old & val) and store result at location
 * pointed by p. The function returns old.
 */
int __ovld atomic_and(volatile __global int *p, int val);
uint __ovld atomic_and(volatile __global uint *p, uint val);
int __ovld atomic_and(volatile __local int *p, int val);
uint __ovld atomic_and(volatile __local uint *p, uint val);
int __ovld atomic_and(volatile __global int *, int);
uint __ovld atomic_and(volatile __global uint *, uint);
int __ovld atomic_and(volatile __local int *, int);
uint __ovld atomic_and(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_and(volatile int *p, int val);
uint __ovld atomic_and(volatile uint *p, uint val);
int __ovld atomic_and(volatile int *, int);
uint __ovld atomic_and(volatile uint *, uint);
#endif

#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_and(volatile __global int *p, int val);
uint __ovld atom_and(volatile __global uint *p, uint val);
int __ovld atom_and(volatile __global int *, int);
uint __ovld atom_and(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_and(volatile __local int *p, int val);
uint __ovld atom_and(volatile __local uint *p, uint val);
int __ovld atom_and(volatile __local int *, int);
uint __ovld atom_and(volatile __local uint *, uint);
#endif

#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_and(volatile __global long *p, long val);
ulong __ovld atom_and(volatile __global ulong *p, ulong val);
long __ovld atom_and(volatile __local long *p, long val);
ulong __ovld atom_and(volatile __local ulong *p, ulong val);
long __ovld atom_and(volatile __global long *, long);
ulong __ovld atom_and(volatile __global ulong *, ulong);
long __ovld atom_and(volatile __local long *, long);
ulong __ovld atom_and(volatile __local ulong *, ulong);
#endif

/**
@ -13174,29 +13174,29 @@ ulong __ovld atom_and(volatile __local ulong *p, ulong val);
|
||||
* (old | val) and store result at location
|
||||
* pointed by p. The function returns old.
|
||||
*/
|
||||
int __ovld atomic_or(volatile __global int *p, int val);
|
||||
uint __ovld atomic_or(volatile __global uint *p, uint val);
|
||||
int __ovld atomic_or(volatile __local int *p, int val);
|
||||
uint __ovld atomic_or(volatile __local uint *p, uint val);
|
||||
int __ovld atomic_or(volatile __global int *, int);
|
||||
uint __ovld atomic_or(volatile __global uint *, uint);
|
||||
int __ovld atomic_or(volatile __local int *, int);
|
||||
uint __ovld atomic_or(volatile __local uint *, uint);
|
||||
#ifdef __OPENCL_CPP_VERSION__
|
||||
int __ovld atomic_or(volatile int *p, int val);
|
||||
uint __ovld atomic_or(volatile uint *p, uint val);
|
||||
int __ovld atomic_or(volatile int *, int);
|
||||
uint __ovld atomic_or(volatile uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_global_int32_extended_atomics)
|
||||
int __ovld atom_or(volatile __global int *p, int val);
|
||||
uint __ovld atom_or(volatile __global uint *p, uint val);
|
||||
int __ovld atom_or(volatile __global int *, int);
|
||||
uint __ovld atom_or(volatile __global uint *, uint);
|
||||
#endif
|
||||
#if defined(cl_khr_local_int32_extended_atomics)
|
||||
int __ovld atom_or(volatile __local int *p, int val);
|
||||
uint __ovld atom_or(volatile __local uint *p, uint val);
|
||||
int __ovld atom_or(volatile __local int *, int);
|
||||
uint __ovld atom_or(volatile __local uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_extended_atomics)
|
||||
long __ovld atom_or(volatile __global long *p, long val);
|
||||
ulong __ovld atom_or(volatile __global ulong *p, ulong val);
|
||||
long __ovld atom_or(volatile __local long *p, long val);
|
||||
ulong __ovld atom_or(volatile __local ulong *p, ulong val);
|
||||
long __ovld atom_or(volatile __global long *, long);
|
||||
ulong __ovld atom_or(volatile __global ulong *, ulong);
|
||||
long __ovld atom_or(volatile __local long *, long);
|
||||
ulong __ovld atom_or(volatile __local ulong *, ulong);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -13205,29 +13205,29 @@ ulong __ovld atom_or(volatile __local ulong *p, ulong val);
|
||||
* (old ^ val) and store result at location
|
||||
* pointed by p. The function returns old.
|
||||
*/
|
||||
int __ovld atomic_xor(volatile __global int *p, int val);
|
||||
uint __ovld atomic_xor(volatile __global uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile __local int *p, int val);
|
||||
uint __ovld atomic_xor(volatile __local uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile __global int *, int);
|
||||
uint __ovld atomic_xor(volatile __global uint *, uint);
|
||||
int __ovld atomic_xor(volatile __local int *, int);
|
||||
uint __ovld atomic_xor(volatile __local uint *, uint);
|
||||
#ifdef __OPENCL_CPP_VERSION__
|
||||
int __ovld atomic_xor(volatile int *p, int val);
|
||||
uint __ovld atomic_xor(volatile uint *p, uint val);
|
||||
int __ovld atomic_xor(volatile int *, int);
|
||||
uint __ovld atomic_xor(volatile uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_global_int32_extended_atomics)
|
||||
int __ovld atom_xor(volatile __global int *p, int val);
|
||||
uint __ovld atom_xor(volatile __global uint *p, uint val);
|
||||
int __ovld atom_xor(volatile __global int *, int);
|
||||
uint __ovld atom_xor(volatile __global uint *, uint);
|
||||
#endif
|
||||
#if defined(cl_khr_local_int32_extended_atomics)
|
||||
int __ovld atom_xor(volatile __local int *p, int val);
|
||||
uint __ovld atom_xor(volatile __local uint *p, uint val);
|
||||
int __ovld atom_xor(volatile __local int *, int);
|
||||
uint __ovld atom_xor(volatile __local uint *, uint);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_extended_atomics)
|
||||
long __ovld atom_xor(volatile __global long *p, long val);
|
||||
ulong __ovld atom_xor(volatile __global ulong *p, ulong val);
|
||||
long __ovld atom_xor(volatile __local long *p, long val);
|
||||
ulong __ovld atom_xor(volatile __local ulong *p, ulong val);
|
||||
long __ovld atom_xor(volatile __global long *, long);
|
||||
ulong __ovld atom_xor(volatile __global ulong *, ulong);
|
||||
long __ovld atom_xor(volatile __local long *, long);
|
||||
ulong __ovld atom_xor(volatile __local ulong *, ulong);
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
|
||||
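
The comment blocks in the hunks above all describe the same read-modify-write contract: each built-in atomically combines the old value at p with val, stores the result, and returns old. A minimal OpenCL C usage sketch, not taken from this header; the kernel name and buffer layout are illustrative:

__kernel void reduce_minmax(__global int *data, __global int *lo, __global int *hi) {
    int v = data[get_global_id(0)];
    atomic_min(lo, v); /* *lo = min(*lo, v); returns the previous *lo */
    atomic_max(hi, v); /* *hi = max(*hi, v); returns the previous *hi */
}
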
@ -15257,13 +15257,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float);
@ -15281,13 +15285,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float2, float2);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float2, float2);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float2, float2);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float2, float2);

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float2, float2);
#endif // cl_khr_depth_images

float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float4, float4);
@ -15380,9 +15388,11 @@ float4 __ovld __purefn read_imagef(read_write image2d_array_t, int4);
int4 __ovld __purefn read_imagei(read_write image2d_array_t, int4);
uint4 __ovld __purefn read_imageui(read_write image2d_array_t, int4);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, int4);
int4 __ovld __purefn read_imagei(read_write image3d_t, int4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes

#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_write image2d_depth_t, int2);
@ -15423,9 +15433,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4

float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float);
#endif // cl_khr_3d_image_writes

float4 __ovld __purefn read_imagef(read_write image1d_t, sampler_t, float, float, float);
int4 __ovld __purefn read_imagei(read_write image1d_t, sampler_t, float, float, float);
@ -15447,9 +15459,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4

float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float2, float2);

#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float4, float4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float4, float4);
#endif // cl_khr_3d_image_writes

#endif //cl_khr_mipmap_image

@ -15457,7 +15471,9 @@ uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, floa
#ifdef cl_khr_fp16
half4 __ovld __purefn read_imageh(read_write image1d_t, int);
half4 __ovld __purefn read_imageh(read_write image2d_t, int2);
#ifdef cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image1d_array_t, int2);
half4 __ovld __purefn read_imageh(read_write image2d_array_t, int4);
half4 __ovld __purefn read_imageh(read_write image1d_buffer_t, int);
@ -15727,7 +15743,9 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t);
int __ovld __cnfn get_image_width(read_write image1d_t);
int __ovld __cnfn get_image_width(read_write image1d_buffer_t);
int __ovld __cnfn get_image_width(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image1d_array_t);
int __ovld __cnfn get_image_width(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -15777,7 +15795,9 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t);

#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_height(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_write image2d_depth_t);
@ -15798,11 +15818,11 @@ int __ovld __cnfn get_image_depth(read_only image3d_t);

#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t);
#endif

#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_depth(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes

// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@ -15824,24 +15844,32 @@ int __ovld get_image_num_mip_levels(write_only image3d_t);
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_t);
int __ovld get_image_num_mip_levels(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(read_write image3d_t);
#endif // cl_khr_3d_image_writes
#endif //defined(__opencl_c_read_write_images)

int __ovld get_image_num_mip_levels(read_only image1d_array_t);
int __ovld get_image_num_mip_levels(read_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_only image2d_depth_t);
#endif // cl_khr_depth_images

int __ovld get_image_num_mip_levels(write_only image1d_array_t);
int __ovld get_image_num_mip_levels(write_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(write_only image2d_depth_t);
#endif // cl_khr_depth_images

#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_array_t);
int __ovld get_image_num_mip_levels(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_write image2d_depth_t);
#endif // cl_khr_depth_images
#endif //defined(__opencl_c_read_write_images)

#endif //cl_khr_mipmap_image
@ -15906,7 +15934,9 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_dept
int __ovld __cnfn get_image_channel_data_type(read_write image1d_t);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -15978,7 +16008,9 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t)
int __ovld __cnfn get_image_channel_order(read_write image1d_t);
int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@ -16048,10 +16080,10 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t);
int4 __ovld __cnfn get_image_dim(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t);
#endif
#if defined(__opencl_c_read_write_images)
int4 __ovld __cnfn get_image_dim(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes

/**
 * Return the image array size.
@ -16266,9 +16298,9 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);

void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags, memory_scope);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

int __ovld __conv sub_group_all(int predicate);
@ -17847,15 +17879,13 @@ intel_sub_group_avc_sic_configure_skc(
    uint skip_block_partition_type, uint skip_motion_vector_mask,
    ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment,
    intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
    uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
    uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
    uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
    uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
    uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
    uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
    ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel,
4 lib/include/ppc_wrappers/emmintrin.h vendored
@ -36,7 +36,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -2262,7 +2262,7 @@ extern __inline __m128d

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */

2 lib/include/ppc_wrappers/mm_malloc.h vendored
@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <stdlib.h>

4 lib/include/ppc_wrappers/mmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -1447,7 +1447,7 @@ extern __inline __m64

#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */

4 lib/include/ppc_wrappers/pmmintrin.h vendored
@ -39,7 +39,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* We need definitions from the SSE2 and SSE header files*/
@ -139,7 +139,7 @@ extern __inline __m128i

#else
#include_next <pmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* PMMINTRIN_H_ */

4 lib/include/ppc_wrappers/smmintrin.h vendored
@ -29,7 +29,7 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -657,7 +657,7 @@ extern __inline __m128i

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */

4 lib/include/ppc_wrappers/tmmintrin.h vendored
@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
@ -447,7 +447,7 @@ extern __inline __m64

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */

4 lib/include/ppc_wrappers/xmmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_

#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* Define four value permute mask */
@ -1821,7 +1821,7 @@ extern __inline void

#else
#include_next <xmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* XMMINTRIN_H_ */
61 lib/include/prfchiintrin.h vendored Normal file
@ -0,0 +1,61 @@
/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PRFCHIINTRIN_H
#define __PRFCHIINTRIN_H

#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))

/// Loads an instruction sequence containing the specified memory address into
/// all level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit0(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}

/// Loads an instruction sequence containing the specified memory address into
/// all but the first-level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit1(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS

#endif /* __PRFCHWINTRIN_H */
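
A hedged usage sketch for the two intrinsics this new file declares; hot_function is a hypothetical symbol, and the header is reached through <x86gprintrin.h> when the prefetchi feature is enabled:

#include <x86gprintrin.h>

extern void hot_function(void); /* hypothetical: code we expect to run soon */

void warm_icache(void) {
    /* Hint the CPU to pull the instruction bytes at this address into cache;
       as the doc comments note, the actual effect is implementation dependent. */
    _m_prefetchit0((const void *)hot_function);
}
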
203 lib/include/raointintrin.h vendored Normal file
@ -0,0 +1,203 @@
/*===----------------------- raointintrin.h - RAOINT ------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86GPRINTRIN_H
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __RAOINTINTRIN_H
#define __RAOINTINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("raoint")))

/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
  __builtin_ia32_aadd32((int *)__A, __B);
}

/// Atomically and a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
  __builtin_ia32_aand32((int *)__A, __B);
}

/// Atomically or a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
  __builtin_ia32_aor32((int *)__A, __B);
}

/// Atomically xor a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
  __builtin_ia32_axor32((int *)__A, __B);
}

#ifdef __x86_64__
/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aadd64((long long *)__A, __B);
}

/// Atomically and a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aand64((long long *)__A, __B);
}

/// Atomically or a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
                                                   long long __B) {
  __builtin_ia32_aor64((long long *)__A, __B);
}

/// Atomically xor a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_axor64((long long *)__A, __B);
}
#endif // __x86_64__

#undef __DEFAULT_FN_ATTRS
#endif // __RAOINTINTRIN_H
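
A minimal sketch of the intended use of these remote-atomic intrinsics, assuming a build with the raoint feature enabled; the function and variable names are illustrative:

#include <x86gprintrin.h>

/* Many threads bump a shared counter with no ordering requirement, the
   contended pattern the doc comments above recommend these intrinsics for. */
void histogram_bump(long long *counter) {
    _aadd_i64(counter, 1); /* remote-atomic add; nothing is returned */
}
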
21 lib/include/riscv_vector.h vendored
@ -25,6 +25,8 @@ extern "C" {
#pragma clang riscv intrinsic vector


#define vlenb() __builtin_rvv_vlenb()

enum RVV_CSR {
  RVV_VSTART = 0,
  RVV_VXSAT,
@ -70,7 +72,6 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
  }
}

#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
#define vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
#define vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
@ -78,25 +79,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
#define vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)

#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
#define vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
#define vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
#define vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
#define vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)

#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
#define vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
#define vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
#define vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
#define vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)

#if __riscv_v_elen >= 64
#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)

#define vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
#define vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
#define vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
#define vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
#endif

#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
#define vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
#define vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
@ -104,23 +108,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
#define vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)

#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
#define vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
#define vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
#define vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
#define vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)

#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
#define vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
#define vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
#define vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
#define vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)

#if __riscv_v_elen >= 64
#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)

#define vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
#define vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
#define vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
#define vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
#endif

typedef __rvv_bool64_t vbool64_t;
typedef __rvv_bool32_t vbool32_t;
typedef __rvv_bool16_t vbool16_t;
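
A small sketch of the strip-mining idiom these macros support; the names are illustrative. vsetvl_e32m1 requests up to avl 32-bit elements at LMUL=1 and returns the count the hardware grants for this iteration:

#include <riscv_vector.h>

void scale(float *dst, const float *src, size_t n, float k) {
    for (size_t i = 0; i < n;) {
        size_t vl = vsetvl_e32m1(n - i); /* elements processed this pass */
        for (size_t j = 0; j < vl; ++j)  /* scalar stand-in for vector ops */
            dst[i + j] = k * src[i + j];
        i += vl;
    }
}
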
2 lib/include/smmintrin.h vendored
@ -818,7 +818,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
///    parameter, is copied to the result.
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be be copied to, and which bits in the result will be
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///    00: Selects bits [31:0] from operand \a Y. \n
30 lib/include/stdarg.h vendored
@ -8,13 +8,30 @@
 */

#ifndef __STDARG_H
#define __STDARG_H

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST
typedef __builtin_va_list __gnuc_va_list;
#endif

#ifdef __need___va_list
#undef __need___va_list
#else
#define __STDARG_H
#ifndef _VA_LIST
typedef __builtin_va_list va_list;
#define _VA_LIST
#endif

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* C2x does not require the second parameter for va_start. */
#define va_start(ap, ...) __builtin_va_start(ap, 0)
#else
/* Versions before C2x do require the second parameter. */
#define va_start(ap, param) __builtin_va_start(ap, param)
#endif
#define va_end(ap) __builtin_va_end(ap)
#define va_arg(ap, type) __builtin_va_arg(ap, type)

@ -23,13 +40,12 @@ typedef __builtin_va_list va_list;
 */
#define __va_copy(d,s) __builtin_va_copy(d,s)

#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    !defined(__STRICT_ANSI__)
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST 1
typedef __builtin_va_list __gnuc_va_list;
#endif

#endif /* __STDARG_H */

#endif /* not __STDARG_H */
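
The new conditional changes only how va_start is spelled, not how callers use it: in C2x mode the macro becomes variadic and ignores any argument after the va_list. A sketch that compiles under both branches:

#include <stdarg.h>

int sum(int count, ...) {
    va_list ap;
    va_start(ap, count); /* pre-C2x: required second argument;
                            C2x: swallowed by the variadic macro */
    int total = 0;
    for (int i = 0; i < count; i++)
        total += va_arg(ap, int);
    va_end(ap);
    return total;
}
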
9 lib/include/stdatomic.h vendored
@ -15,10 +15,12 @@
 *
 * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
 * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
 * to the clang resource header until that is fully supported.
 * to the clang resource header until that is fully supported. The
 * `stdatomic.h` header requires C++ 23 or newer.
 */
#if __STDC_HOSTED__ && \
    __has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
    __has_include_next(<stdatomic.h>) && \
    (!defined(_MSC_VER) || (defined(__cplusplus) && __cplusplus >= 202002L))
# include_next <stdatomic.h>
#else

@ -45,7 +47,8 @@ extern "C" {
/* 7.17.2 Initialization */

#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
#if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201710L) || \
     (defined(__cplusplus) && __cplusplus >= 202002L)) && \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
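
Since the pragma above now flags ATOMIC_VAR_INIT, a sketch of the migration the deprecation points at; plain initialization has been equivalent since C11:

#include <stdatomic.h>

static atomic_int old_style = ATOMIC_VAR_INIT(0); /* now draws a warning */
static atomic_int new_style = 0;                  /* same effect, no warning */
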
4 lib/include/stdbool.h vendored
@ -12,7 +12,7 @@

#define __bool_true_false_are_defined 1

#if __STDC_VERSION__ > 201710L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
 * to system headers which include this header file unconditionally.
 */
@ -23,7 +23,7 @@
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if __cplusplus < 201103L
#if defined(__cplusplus) && __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
9 lib/include/stddef.h vendored
@ -97,8 +97,15 @@ using ::std::nullptr_t;
#undef __need_NULL
#endif /* defined(__need_NULL) */

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
typedef typeof(nullptr) nullptr_t;
#endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */

#if defined(__need_STDDEF_H_misc)
#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    (defined(__cplusplus) && __cplusplus >= 201103L)
#include "__stddef_max_align_t.h"
#endif
#define offsetof(t, d) __builtin_offsetof(t, d)
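
A short sketch of the surface this hunk adds: nullptr_t exists only when the compiler reports a C2x __STDC_VERSION__, while offsetof is unchanged. Names are illustrative:

#include <stddef.h>

struct pair { char a; int b; };
size_t b_offset = offsetof(struct pair, b); /* unchanged behavior */

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
nullptr_t nothing = nullptr; /* the type's only value */
#endif
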
198 lib/include/stdint.h vendored
@ -96,13 +96,21 @@
typedef __INT64_TYPE__ int64_t;
# endif /* __int8_t_defined */
typedef __UINT64_TYPE__ uint64_t;
# undef __int_least64_t
# define __int_least64_t int64_t
# undef __uint_least64_t
# define __uint_least64_t uint64_t
# undef __int_least32_t
# define __int_least32_t int64_t
# undef __uint_least32_t
# define __uint_least32_t uint64_t
# undef __int_least16_t
# define __int_least16_t int64_t
# undef __uint_least16_t
# define __uint_least16_t uint64_t
# undef __int_least8_t
# define __int_least8_t int64_t
# undef __uint_least8_t
# define __uint_least8_t uint64_t
#endif /* __INT64_TYPE__ */

@ -120,11 +128,17 @@ typedef int56_t int_least56_t;
typedef uint56_t uint_least56_t;
typedef int56_t int_fast56_t;
typedef uint56_t uint_fast56_t;
# undef __int_least32_t
# define __int_least32_t int56_t
# undef __uint_least32_t
# define __uint_least32_t uint56_t
# undef __int_least16_t
# define __int_least16_t int56_t
# undef __uint_least16_t
# define __uint_least16_t uint56_t
# undef __int_least8_t
# define __int_least8_t int56_t
# undef __uint_least8_t
# define __uint_least8_t uint56_t
#endif /* __INT56_TYPE__ */

@ -136,11 +150,17 @@ typedef int48_t int_least48_t;
typedef uint48_t uint_least48_t;
typedef int48_t int_fast48_t;
typedef uint48_t uint_fast48_t;
# undef __int_least32_t
# define __int_least32_t int48_t
# undef __uint_least32_t
# define __uint_least32_t uint48_t
# undef __int_least16_t
# define __int_least16_t int48_t
# undef __uint_least16_t
# define __uint_least16_t uint48_t
# undef __int_least8_t
# define __int_least8_t int48_t
# undef __uint_least8_t
# define __uint_least8_t uint48_t
#endif /* __INT48_TYPE__ */

@ -152,11 +172,17 @@ typedef int40_t int_least40_t;
typedef uint40_t uint_least40_t;
typedef int40_t int_fast40_t;
typedef uint40_t uint_fast40_t;
# undef __int_least32_t
# define __int_least32_t int40_t
# undef __uint_least32_t
# define __uint_least32_t uint40_t
# undef __int_least16_t
# define __int_least16_t int40_t
# undef __uint_least16_t
# define __uint_least16_t uint40_t
# undef __int_least8_t
# define __int_least8_t int40_t
# undef __uint_least8_t
# define __uint_least8_t uint40_t
#endif /* __INT40_TYPE__ */

@ -172,11 +198,17 @@ typedef __INT32_TYPE__ int32_t;
typedef __UINT32_TYPE__ uint32_t;
# endif /* __uint32_t_defined */

# undef __int_least32_t
# define __int_least32_t int32_t
# undef __uint_least32_t
# define __uint_least32_t uint32_t
# undef __int_least16_t
# define __int_least16_t int32_t
# undef __uint_least16_t
# define __uint_least16_t uint32_t
# undef __int_least8_t
# define __int_least8_t int32_t
# undef __uint_least8_t
# define __uint_least8_t uint32_t
#endif /* __INT32_TYPE__ */

@ -194,9 +226,13 @@ typedef int24_t int_least24_t;
typedef uint24_t uint_least24_t;
typedef int24_t int_fast24_t;
typedef uint24_t uint_fast24_t;
# undef __int_least16_t
# define __int_least16_t int24_t
# undef __uint_least16_t
# define __uint_least16_t uint24_t
# undef __int_least8_t
# define __int_least8_t int24_t
# undef __uint_least8_t
# define __uint_least8_t uint24_t
#endif /* __INT24_TYPE__ */

@ -205,9 +241,13 @@ typedef uint24_t uint_fast24_t;
typedef __INT16_TYPE__ int16_t;
#endif /* __int8_t_defined */
typedef __UINT16_TYPE__ uint16_t;
# undef __int_least16_t
# define __int_least16_t int16_t
# undef __uint_least16_t
# define __uint_least16_t uint16_t
# undef __int_least8_t
# define __int_least8_t int16_t
# undef __uint_least8_t
# define __uint_least8_t uint16_t
#endif /* __INT16_TYPE__ */

@ -224,7 +264,9 @@ typedef __uint_least16_t uint_fast16_t;
typedef __INT8_TYPE__ int8_t;
#endif /* __int8_t_defined */
typedef __UINT8_TYPE__ uint8_t;
# undef __int_least8_t
# define __int_least8_t int8_t
# undef __uint_least8_t
# define __uint_least8_t uint8_t
#endif /* __INT8_TYPE__ */
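
These blocks run from the widest type down to the narrowest, and each one re-targets the __int_leastN_t/__uint_leastN_t aliases at the smallest exact-width type seen so far; the new #undef lines keep redefinition warnings away when a narrower type later takes over. A sketch of the observable effect, assuming a hypothetical target with no 16-bit integer type:

#include <stdint.h>

/* With __INT32_TYPE__ present but __INT16_TYPE__ absent, the chain leaves
   the 16-bit "least" aliases pointing at the 32-bit type. */
int_least16_t small = 1234; /* at least 16 bits wide; may actually be 32 */
_Static_assert(sizeof(int_least16_t) * 8 >= 16, "least-width lower bound");
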
@ -285,16 +327,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT64_TYPE__
|
||||
# undef __int64_c_suffix
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT64_C_SUFFIX__
|
||||
# define __int64_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int32_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT64_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT64_C_SUFFIX__
|
||||
# else
|
||||
# undef __int64_c_suffix
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT64_C_SUFFIX__ */
|
||||
#endif /* __INT64_TYPE__ */
|
||||
|
||||
@ -310,6 +351,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT56_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT56_C_SUFFIX__
|
||||
# define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
|
||||
# define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
|
||||
@ -319,14 +363,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT56_C(v) v
|
||||
# define UINT56_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT56_C_SUFFIX__ */
|
||||
#endif /* __INT56_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT48_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT48_C_SUFFIX__
|
||||
# define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
|
||||
# define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
|
||||
@ -336,14 +380,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT48_C(v) v
|
||||
# define UINT48_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT48_C_SUFFIX__ */
|
||||
#endif /* __INT48_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT40_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT40_C_SUFFIX__
|
||||
# define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
|
||||
# define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
|
||||
@ -353,22 +397,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT40_C(v) v
|
||||
# define UINT40_C(v) v ## U
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT40_C_SUFFIX__ */
|
||||
#endif /* __INT40_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT32_TYPE__
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT32_C_SUFFIX__
|
||||
# define __int32_c_suffix __INT32_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT32_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT32_C_SUFFIX__
|
||||
#else
|
||||
# undef __int32_c_suffix
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT32_C_SUFFIX__ */
|
||||
#endif /* __INT32_TYPE__ */
|
||||
|
||||
@ -384,6 +424,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT24_TYPE__
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT24_C_SUFFIX__
|
||||
# define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
|
||||
# define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
|
||||
@ -392,19 +434,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# else
|
||||
# define INT24_C(v) v
|
||||
# define UINT24_C(v) v ## U
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT24_C_SUFFIX__ */
|
||||
#endif /* __INT24_TYPE__ */
|
||||
|
||||
|
||||
#ifdef __INT16_TYPE__
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT16_C_SUFFIX__
|
||||
# define __int16_c_suffix __INT16_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT16_C_SUFFIX__
|
||||
#else
|
||||
# undef __int16_c_suffix
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT16_C_SUFFIX__ */
|
||||
#endif /* __INT16_TYPE__ */
|
||||
|
||||
@ -420,10 +459,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
|
||||
|
||||
#ifdef __INT8_TYPE__
|
||||
# undef __int8_c_suffix
|
||||
# ifdef __INT8_C_SUFFIX__
|
||||
# define __int8_c_suffix __INT8_C_SUFFIX__
|
||||
#else
|
||||
# undef __int8_c_suffix
|
||||
# endif /* __INT8_C_SUFFIX__ */
|
||||
#endif /* __INT8_TYPE__ */
|
||||
|
||||
@ -463,27 +501,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
||||
# define UINT64_MAX UINT64_C(18446744073709551615)
|
||||
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||
in C2x mode; switch to the correct values once they've been published. */
|
||||
#if __STDC_VERSION__ >= 202000L
|
||||
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
|
||||
# define UINT64_WIDTH 64
|
||||
# define INT64_WIDTH UINT64_WIDTH
|
||||
|
||||
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST32_WIDTH
|
||||
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST16_WIDTH
|
||||
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
|
||||
# undef __UINT_LEAST8_MAX
|
||||
# define __UINT_LEAST8_MAX UINT64_MAX
|
||||
#endif /* __STDC_VERSION__ */
|
||||
|
||||
# define __INT_LEAST64_MIN INT64_MIN
|
||||
# define __INT_LEAST64_MAX INT64_MAX
|
||||
# define __UINT_LEAST64_MAX UINT64_MAX
|
||||
# undef __INT_LEAST32_MIN
|
||||
# define __INT_LEAST32_MIN INT64_MIN
|
||||
# undef __INT_LEAST32_MAX
|
||||
# define __INT_LEAST32_MAX INT64_MAX
|
||||
# undef __UINT_LEAST32_MAX
|
||||
# define __UINT_LEAST32_MAX UINT64_MAX
|
||||
# undef __INT_LEAST16_MIN
|
||||
# define __INT_LEAST16_MIN INT64_MIN
|
||||
# undef __INT_LEAST16_MAX
|
||||
# define __INT_LEAST16_MAX INT64_MAX
|
||||
# undef __UINT_LEAST16_MAX
|
||||
# define __UINT_LEAST16_MAX UINT64_MAX
|
||||
# undef __INT_LEAST8_MIN
|
||||
# define __INT_LEAST8_MIN INT64_MIN
|
||||
# undef __INT_LEAST8_MAX
|
||||
# define __INT_LEAST8_MAX INT64_MAX
|
||||
# undef __UINT_LEAST8_MAX
|
||||
# define __UINT_LEAST8_MAX UINT64_MAX
|
||||
#endif /* __INT64_TYPE__ */
|
||||
|
||||
@ -497,7 +547,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH

@ -517,27 +567,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT56_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT56_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT56_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT56_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT56_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT56_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT56_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT56_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */

@ -554,27 +616,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT48_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT48_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT48_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT48_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT48_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT48_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT48_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT48_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#undef __UINT_LEAST32_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
# undef __UINT_LEAST16_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
# undef __UINT_LEAST8_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
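
The least-width cascade above is what keeps the exotic exact-width types
self-consistent: on a target whose narrowest type wider than 16 bits is
48-bit, the 32-bit "least" limits must fall back to the 48-bit ones. A sketch
of the resulting guarantee (hypothetical target with __INT48_TYPE__ and no
__INT32_TYPE__; no real Clang target needs this branch):

#include <limits.h>
#include <stdint.h>
#if defined(__INT48_TYPE__) && !defined(__INT32_TYPE__)
/* int_least32_t is the 48-bit type, so its limits are the 48-bit ones. */
_Static_assert(INT_LEAST32_MAX == INT48_MAX, "least32 falls back to 48-bit");
_Static_assert(sizeof(int_least32_t) * CHAR_BIT >= 48, "wide enough");
#endif
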
@ -591,27 +665,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT40_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT40_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT40_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT40_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT40_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT40_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT40_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT40_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */

@ -622,23 +708,35 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)

# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT32_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT32_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT32_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT32_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT32_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT32_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT32_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT32_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
@ -653,7 +751,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH

@ -673,23 +771,31 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX

# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT24_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT24_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT24_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT24_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT24_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@ -700,19 +806,27 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)

# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT16_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT16_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT16_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT16_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT16_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */

@ -727,7 +841,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
@ -741,15 +855,19 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)

# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT8_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT8_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */

@ -764,7 +882,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH

@ -792,7 +910,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__

@ -813,7 +931,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__

@ -849,7 +967,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
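
Taken together, the stdint.h changes make the C2x width macros usable under
-Wundef in any language mode. A small usage sketch (standard C2x names only;
build with -std=c2x):

#include <stdint.h>
#if defined(INT32_WIDTH) && defined(INTPTR_WIDTH)
_Static_assert(INT32_WIDTH == 32, "exact-width types have exact widths");
_Static_assert(INTPTR_WIDTH == UINTPTR_WIDTH, "signed/unsigned widths agree");
#endif
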
2
lib/include/stdnoreturn.h
vendored
@ -13,7 +13,7 @@
#define noreturn _Noreturn
#define __noreturn_is_defined 1

#if __STDC_VERSION__ > 201710L && \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) && \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* The noreturn macro is deprecated in C2x. We do not mark it as such because
   including the header file in C2x is also deprecated and we do not want to
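
The noreturn convenience macro this header provides still expands to the
_Noreturn keyword; only the deprecation-warning guard changed. A usage sketch
(hypothetical die() helper):

#include <stdlib.h>
#include <stdnoreturn.h>

noreturn void die(int code) { /* expands to: _Noreturn void die(int code) */
  exit(code);
}
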
3
lib/include/unwind.h
vendored
@ -65,7 +65,8 @@ struct _Unwind_Context;
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
struct _Unwind_Control_Block;
typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
typedef struct _Unwind_Control_Block _Unwind_Control_Block;
#define _Unwind_Exception _Unwind_Control_Block /* Alias */
#else
struct _Unwind_Exception;
typedef struct _Unwind_Exception _Unwind_Exception;
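
On ARM EHABI the alias moves from a typedef to a #define, so spelling out the
struct tag in user code now also resolves to the control block. A sketch of
the effect (illustrative only):

/* With the #define alias in force on ARM EHABI targets: */
struct _Unwind_Exception *exc; /* really struct _Unwind_Control_Block * */
_Unwind_Exception object;      /* still works, as with the old typedef  */
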
2
lib/include/velintrin.h
vendored
@ -13,7 +13,7 @@
typedef double __vr __attribute__((__vector_size__(2048)));

// Vector mask registers
#if __STDC_VERSION__ >= 199901L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
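
The C99 gate exists because _Bool is a C99 keyword, so the vector-of-_Bool
mask types can only be declared in C99 and later. A minimal sketch of the
same pattern (hypothetical mask type name):

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
typedef _Bool my_mask256 __attribute__((ext_vector_type(256)));
#endif
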
26
lib/include/x86gprintrin.h
vendored
@ -25,23 +25,35 @@
#include <crc32intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__PRFCHI__)
#include <prfchiintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__RAOINT__)
#include <raointintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__CMPCCXADD__)
#include <cmpccxaddintrin.h>
#endif

#if defined(__i386__)
#define __FULLBX "ebx"
#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
#define __TMPGPR "eax"
#else
// When in 64-bit target, the 32-bit operands generate a 32-bit result,
// zero-extended to a 64-bit result in the destination general-purpose
// register. This means "mov x, %ebx" will clobber the higher 32 bits of
// rbx, so we should preserve the 64-bit register rbx.
#define __FULLBX "rbx"
|
||||
#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
|
||||
#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
|
||||
#define __TMPGPR "rax"
|
||||
#endif
|
||||
|
||||
#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
|
||||
|
||||
#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
|
||||
#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
|
||||
|
||||
#define __SSC_MARK(__Tag) \
|
||||
__asm__ __volatile__( __SAVE_GPRBX \
|
||||
"mov {%0, %%ebx|ebx, %0}; " \
|
||||
|
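
The refactor collapses the four dialect-aware save/restore strings into one
__MOVEGPR template; the {att|intel} braces let a single string literal serve
both inline-asm dialects. A worked expansion (sketch, 64-bit case):

/* __SAVE_GPRBX
     -> __MOVEGPR(__FULLBX, __TMPGPR)
     -> __MOVEGPR("rbx", "rax")
     -> "mov {%%rbx, %%rax|rax, rbx};"
   i.e. AT&T syntax left of the '|', Intel syntax right of it. */
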
3
lib/include/xmmintrin.h
vendored
@ -1906,7 +1906,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
  return __extension__ (__m128){ 0, 0, 0, 0 };
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
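
Switching the initializer from integer 0 to 0.0f does not change the value;
it only spells the float lanes as floats, which presumably keeps
conversion-sensitive tooling quiet. Usage is unchanged (sketch):

#include <xmmintrin.h>
__m128 z = _mm_setzero_ps(); /* all four lanes are 0.0f */
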
@ -3005,7 +3005,6 @@ do { \
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
#define _m_ _mm_

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX