Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@@ -0,0 +1,157 @@
set(files
adxintrin.h
altivec.h
ammintrin.h
arm_acle.h
armintr.h
arm64intr.h
avx2intrin.h
avx512bwintrin.h
avx512bitalgintrin.h
avx512vlbitalgintrin.h
avx512cdintrin.h
avx512vpopcntdqintrin.h
avx512dqintrin.h
avx512erintrin.h
avx512fintrin.h
avx512ifmaintrin.h
avx512ifmavlintrin.h
avx512pfintrin.h
avx512vbmiintrin.h
avx512vbmivlintrin.h
avx512vbmi2intrin.h
avx512vlvbmi2intrin.h
avx512vlbwintrin.h
avx512vlcdintrin.h
avx512vldqintrin.h
avx512vlintrin.h
avx512vpopcntdqvlintrin.h
avx512vnniintrin.h
avx512vlvnniintrin.h
avxintrin.h
bmi2intrin.h
bmiintrin.h
__clang_cuda_builtin_vars.h
__clang_cuda_cmath.h
__clang_cuda_complex_builtins.h
__clang_cuda_intrinsics.h
__clang_cuda_math_forward_declares.h
__clang_cuda_runtime_wrapper.h
cetintrin.h
clzerointrin.h
cpuid.h
clflushoptintrin.h
clwbintrin.h
emmintrin.h
f16cintrin.h
float.h
fma4intrin.h
fmaintrin.h
fxsrintrin.h
gfniintrin.h
htmintrin.h
htmxlintrin.h
ia32intrin.h
immintrin.h
intrin.h
inttypes.h
iso646.h
limits.h
lwpintrin.h
lzcntintrin.h
mm3dnow.h
mmintrin.h
mm_malloc.h
module.modulemap
msa.h
mwaitxintrin.h
nmmintrin.h
opencl-c.h
pkuintrin.h
pmmintrin.h
popcntintrin.h
prfchwintrin.h
rdseedintrin.h
rtmintrin.h
s390intrin.h
shaintrin.h
smmintrin.h
stdalign.h
stdarg.h
stdatomic.h
stdbool.h
stddef.h
__stddef_max_align_t.h
stdint.h
stdnoreturn.h
tbmintrin.h
tgmath.h
tmmintrin.h
unwind.h
vadefs.h
vaesintrin.h
varargs.h
vecintrin.h
vpclmulqdqintrin.h
wmmintrin.h
__wmmintrin_aes.h
__wmmintrin_pclmul.h
x86intrin.h
xmmintrin.h
xopintrin.h
xsavecintrin.h
xsaveintrin.h
xsaveoptintrin.h
xsavesintrin.h
xtestintrin.h
)
set(cuda_wrapper_files
cuda_wrappers/algorithm
cuda_wrappers/complex
cuda_wrappers/new
)
set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
# Generate arm_neon.h
clang_tablegen(arm_neon.h -gen-arm-neon
SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
set(out_files)
foreach( f ${files} ${cuda_wrapper_files} )
set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
set( dst ${output_dir}/${f} )
add_custom_command(OUTPUT ${dst}
DEPENDS ${src}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
COMMENT "Copying clang's ${f}...")
list(APPEND out_files ${dst})
endforeach( f )
add_custom_command(OUTPUT ${output_dir}/arm_neon.h
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
COMMENT "Copying clang's arm_neon.h...")
list(APPEND out_files ${output_dir}/arm_neon.h)
add_custom_target(clang-headers ALL DEPENDS ${out_files})
set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
install(
FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
COMPONENT clang-headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
install(
FILES ${cuda_wrapper_files}
COMPONENT clang-headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include/cuda_wrappers)
if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDE's.
add_llvm_install_targets(install-clang-headers
DEPENDS clang-headers
COMPONENT clang-headers)
endif()

View File

@@ -0,0 +1,126 @@
/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CUDA_BUILTIN_VARS_H
#define __CUDA_BUILTIN_VARS_H
// Forward declares from vector_types.h.
struct uint3;
struct dim3;
// The file implements built-in CUDA variables using __declspec(property).
// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
// All read accesses of built-in variable fields get converted into calls to a
// getter function which in turn calls the appropriate builtin to fetch the
// value.
//
// Example:
// int x = threadIdx.x;
// IR output:
// %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
// PTX output:
// mov.u32 %r2, %tid.x;
#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC) \
__declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD; \
static inline __attribute__((always_inline)) \
__attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) { \
return INTRINSIC; \
}
#if __cplusplus >= 201103L
#define __DELETE =delete
#else
#define __DELETE
#endif
// Make sure nobody can create instances of the special varible types. nvcc
// also disallows taking address of special variables, so we disable address-of
// operator as well.
#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \
__attribute__((device)) TypeName() __DELETE; \
__attribute__((device)) TypeName(const TypeName &) __DELETE; \
__attribute__((device)) void operator=(const TypeName &) const __DELETE; \
__attribute__((device)) TypeName *operator&() const __DELETE
struct __cuda_builtin_threadIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
};
struct __cuda_builtin_blockIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
};
struct __cuda_builtin_blockDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
};
struct __cuda_builtin_gridDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
};
#define __CUDA_BUILTIN_VAR \
extern const __attribute__((device)) __attribute__((weak))
__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
// warpSize should translate to read of %WARP_SZ but there's currently no
// builtin to do so. According to PTX v4.2 docs 'to date, all target
// architectures have a WARP_SZ value of 32'.
__attribute__((device)) const int warpSize = 32;
#undef __CUDA_DEVICE_BUILTIN
#undef __CUDA_BUILTIN_VAR
#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
#endif /* __CUDA_BUILTIN_VARS_H */

View File

@@ -0,0 +1,472 @@
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_CMATH_H__
#define __CLANG_CUDA_CMATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#include <limits>
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
// these functions in namespace std, because our device functions need to
// overload the standard library functions, which may be declared in the global
// namespace or in std, depending on the degree of conformance of the stdlib
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
__DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
// For inscrutable reasons, the CUDA headers define these functions for us on
// Windows.
#ifndef _MSC_VER
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
// For inscrutable reasons, __finite(), the double-precision version of
// __finitef, does not exist when compiling for MacOS. __isfinited is available
// everywhere and is just as good.
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#endif
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ float ldexp(float __arg, int __exp) {
return ::ldexpf(__arg, __exp);
}
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
__DEVICE__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// Notably missing above is nexttoward. We omit it because
// libdevice doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
//
// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
// only sin(float) and sin(double), which means that e.g. sin(0) is
// ambiguous.
//
// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
// std. These are defined in the CUDA headers in the global namespace,
// independent of everything else we've done here.
// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
// we go ahead and unconditionally define functions that are only available when
// compiling for C++11 to match the behavior of the CUDA headers.
template<bool __B, class __T = void>
struct __clang_cuda_enable_if {};
template <class __T> struct __clang_cuda_enable_if<true, __T> {
typedef __T type;
};
// Defines an overload of __fn that accepts one integral argument, calls
// __fn((double)x), and returns __retty.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
template <typename __T> \
__DEVICE__ \
typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
// Defines an overload of __fn that accepts one two arithmetic arguments, calls
// __fn((double)x, (double)y), and returns a double.
//
// Note this is different from OVERLOAD_1, which generates an overload that
// accepts only *integral* arguments.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __clang_cuda_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
// Overloads for functions that don't match the patterns expected by
// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return std::fma((double)__x, (double)__y, (double)__z);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
frexp(__T __x, int *__exp) {
return std::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return std::remquo((double)__x, (double)__y, __quo);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbln(__T __x, long __exp) {
return std::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbn(__T __x, int __exp) {
return std::scalbn((double)__x, __exp);
}
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
// Pull the new overloads we defined above into namespace std.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// than its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that CUDA defines in its headers into
// namespace std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#undef __DEVICE__
#endif

View File

@@ -0,0 +1,203 @@
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling c99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA.)
extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
double __c, double __d) {
double __ac = __a * __c;
double __bd = __b * __d;
double __ad = __a * __d;
double __bc = __b * __c;
double _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
int __recalc = 0;
if (std::isinf(__a) || std::isinf(__b)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (std::isinf(__c) || std::isinf(__d)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
__recalc = 1;
}
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
std::isinf(__ad) || std::isinf(__bc))) {
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (__recalc) {
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
// a device overload (and isn't constexpr before C++11, naturally).
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
float __c, float __d) {
float __ac = __a * __c;
float __bd = __b * __d;
float __ad = __a * __d;
float __bc = __b * __c;
float _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
int __recalc = 0;
if (std::isinf(__a) || std::isinf(__b)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (std::isinf(__c) || std::isinf(__d)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
__recalc = 1;
}
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
std::isinf(__ad) || std::isinf(__bc))) {
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (__recalc) {
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
double __c, double __d) {
int __ilogbw = 0;
// Can't use std::max, because that's defined in <algorithm>, and we don't
// want to pull that in for every compile. The CUDA headers define
// ::max(float, float) and ::max(double, double), which is sufficient for us.
double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
if (std::isfinite(__logbw)) {
__ilogbw = (int)__logbw;
__c = std::scalbn(__c, -__ilogbw);
__d = std::scalbn(__d, -__ilogbw);
}
double __denom = __c * __c + __d * __d;
double _Complex z;
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
std::isfinite(__d)) {
__a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
__b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
std::isfinite(__b)) {
__c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
__d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
__real__(z) = 0.0 * (__a * __c + __b * __d);
__imag__(z) = 0.0 * (__b * __c - __a * __d);
}
}
return z;
}
extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
float __c, float __d) {
int __ilogbw = 0;
float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
if (std::isfinite(__logbw)) {
__ilogbw = (int)__logbw;
__c = std::scalbn(__c, -__ilogbw);
__d = std::scalbn(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float _Complex z;
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
std::isfinite(__d)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
std::isfinite(__b)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
__real__(z) = 0 * (__a * __c + __b * __d);
__imag__(z) = 0 * (__b * __c - __a * __d);
}
}
return z;
}
#endif // __CLANG_CUDA_COMPLEX_BUILTINS

View File

@@ -0,0 +1,489 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// sm_30 intrinsics: __shfl_{up,down,xor}.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
__Type) \
inline __device__ int __FnName(int __val, __Type __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, __Type __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, __Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__val, &__tmp, sizeof(__val)); \
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ long __FnName(long __val, __Type __offset, \
int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
int);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
__Type __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__val, &__tmp, sizeof(__val)); \
__tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned int __mask, unsigned long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
__Type __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__mask, __tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
return __nvvm_bar_warp_sync(mask);
}
inline __device__ void __barrier_sync(unsigned int id) {
__nvvm_barrier_sync(id);
}
inline __device__ void __barrier_sync_count(unsigned int id,
unsigned int count) {
__nvvm_barrier_sync_cnt(id, count);
}
inline __device__ int __all_sync(unsigned int mask, int pred) {
return __nvvm_vote_all_sync(mask, pred);
}
inline __device__ int __any_sync(unsigned int mask, int pred) {
return __nvvm_vote_any_sync(mask, pred);
}
inline __device__ int __uni_sync(unsigned int mask, int pred) {
return __nvvm_vote_uni_sync(mask, pred);
}
inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
return __nvvm_vote_ballot_sync(mask, pred);
}
inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
unsigned int value) {
return __nvvm_match_any_sync_i32(mask, value);
}
inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
return __nvvm_match_any_sync_i64(mask, value);
}
inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
return __nvvm_match_all_sync_i32p(mask, value, pred);
}
inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // __CUDA_VERSION >= 9000
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
inline __device__ char2 __ldg(const char2 *ptr) {
typedef char c2 __attribute__((ext_vector_type(2)));
// We can assume that ptr is aligned at least to char2's alignment, but the
// load will assume that ptr is aligned to char2's alignment. This is only
// safe if alignof(c2) <= alignof(char2).
c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
char2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
typedef char c4 __attribute__((ext_vector_type(4)));
c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
char4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
typedef short s2 __attribute__((ext_vector_type(2)));
s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
short2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
typedef short s4 __attribute__((ext_vector_type(4)));
s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
short4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
typedef int i2 __attribute__((ext_vector_type(2)));
i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
int2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
typedef int i4 __attribute__((ext_vector_type(4)));
i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
int4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
typedef long long ll2 __attribute__((ext_vector_type(2)));
ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
longlong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
uchar2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
uchar4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
typedef unsigned short us2 __attribute__((ext_vector_type(2)));
us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
ushort2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
typedef unsigned short us4 __attribute__((ext_vector_type(4)));
us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
ushort4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
uint2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
uint4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
ulonglong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
typedef float f2 __attribute__((ext_vector_type(2)));
f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
float2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
typedef float f4 __attribute__((ext_vector_type(4)));
f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
float4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
typedef double d2 __attribute__((ext_vector_type(2)));
d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
double2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.clamp.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned ret;
asm("shf.r.clamp.b32 %0, %1, %2, %3;"
: "=r"(ret)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return ret;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)

View File

@@ -0,0 +1,286 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// This file forward-declares of some math functions we (or the CUDA headers)
// will define later. We need to do this, and do it before cmath is included,
// because the standard library may have constexpr math functions. In the
// absence of a prior __device__ decl, those constexpr functions may become
// implicitly host+device. host+device functions can't be overloaded, so that
// would preclude the use of our own __device__ overloads for these functions.
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
__DEVICE__ int abs(int);
__DEVICE__ long abs(long);
__DEVICE__ long long abs(long long);
__DEVICE__ double acos(double);
__DEVICE__ float acos(float);
__DEVICE__ double acosh(double);
__DEVICE__ float acosh(float);
__DEVICE__ double asin(double);
__DEVICE__ float asin(float);
__DEVICE__ double asinh(double);
__DEVICE__ float asinh(float);
__DEVICE__ double atan2(double, double);
__DEVICE__ float atan2(float, float);
__DEVICE__ double atan(double);
__DEVICE__ float atan(float);
__DEVICE__ double atanh(double);
__DEVICE__ float atanh(float);
__DEVICE__ double cbrt(double);
__DEVICE__ float cbrt(float);
__DEVICE__ double ceil(double);
__DEVICE__ float ceil(float);
__DEVICE__ double copysign(double, double);
__DEVICE__ float copysign(float, float);
__DEVICE__ double cos(double);
__DEVICE__ float cos(float);
__DEVICE__ double cosh(double);
__DEVICE__ float cosh(float);
__DEVICE__ double erfc(double);
__DEVICE__ float erfc(float);
__DEVICE__ double erf(double);
__DEVICE__ float erf(float);
__DEVICE__ double exp2(double);
__DEVICE__ float exp2(float);
__DEVICE__ double exp(double);
__DEVICE__ float exp(float);
__DEVICE__ double expm1(double);
__DEVICE__ float expm1(float);
__DEVICE__ double fabs(double);
__DEVICE__ float fabs(float);
__DEVICE__ double fdim(double, double);
__DEVICE__ float fdim(float, float);
__DEVICE__ double floor(double);
__DEVICE__ float floor(float);
__DEVICE__ double fma(double, double, double);
__DEVICE__ float fma(float, float, float);
__DEVICE__ double fmax(double, double);
__DEVICE__ float fmax(float, float);
__DEVICE__ double fmin(double, double);
__DEVICE__ float fmin(float, float);
__DEVICE__ double fmod(double, double);
__DEVICE__ float fmod(float, float);
__DEVICE__ int fpclassify(double);
__DEVICE__ int fpclassify(float);
__DEVICE__ double frexp(double, int *);
__DEVICE__ float frexp(float, int *);
__DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
__DEVICE__ bool isfinite(double);
__DEVICE__ bool isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
__DEVICE__ float log10(float);
__DEVICE__ double log1p(double);
__DEVICE__ float log1p(float);
__DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
__DEVICE__ float nanf(const char *);
__DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
__DEVICE__ float pow(float, int);
__DEVICE__ double remainder(double, double);
__DEVICE__ float remainder(float, float);
__DEVICE__ double remquo(double, double, int *);
__DEVICE__ float remquo(float, float, int *);
__DEVICE__ double rint(double);
__DEVICE__ float rint(float);
__DEVICE__ double round(double);
__DEVICE__ float round(float);
__DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
__DEVICE__ bool signbit(double);
__DEVICE__ bool signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
__DEVICE__ float sinh(float);
__DEVICE__ double sqrt(double);
__DEVICE__ float sqrt(float);
__DEVICE__ double tan(double);
__DEVICE__ float tan(float);
__DEVICE__ double tanh(double);
__DEVICE__ float tanh(float);
__DEVICE__ double tgamma(double);
__DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
using ::abs;
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isinf;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnan;
using ::isnormal;
using ::isunordered;
using ::labs;
using ::ldexp;
using ::lgamma;
using ::llabs;
using ::llrint;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::llround;
using ::modf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#pragma pop_macro("__DEVICE__")
#endif

View File

@@ -0,0 +1,381 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
* CUDA headers are implemented in a way that currently makes it
* impossible for user code to #include directly when compiling with
* Clang. They present different view of CUDA-supplied functions
* depending on where in NVCC's compilation pipeline the headers are
* included. Neither of these modes provides function definitions with
* correct attributes, so we use preprocessor to force the headers
* into a form that Clang can use.
*
* Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
* this file during every CUDA compilation.
*/
#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
#if defined(__CUDA__) && defined(__clang__)
// Include some forward declares that must come before cmath.
#include <__clang_cuda_math_forward_declares.h>
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
// Preserve common macros that will be changed below by us or by CUDA
// headers.
#pragma push_macro("__THROW")
#pragma push_macro("__CUDA_ARCH__")
// WARNING: Preprocessor hacks below are based on specific details of
// CUDA-7.x headers and are not expected to work with any other
// version of CUDA headers.
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
#error "Unsupported CUDA version!"
#endif
// Make largest subset of device functions available during host
// compilation -- SM_35 for the time being.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#endif
#include "__clang_cuda_builtin_vars.h"
// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
// {math,device}_functions.h only have declarations of the
// functions. We don't need them as we're going to pull in their
// definitions from .hpp files.
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
#undef __CUDACC__
#if CUDA_VERSION < 9000
#define __CUDABE__
#else
#define __CUDA_LIBDEVICE__
#endif
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "driver_types.h"
#include "host_config.h"
#include "host_defines.h"
#undef __CUDABE__
#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"
#undef __CUDACC__
#define __CUDABE__
// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
// not have at the moment. Emulate them with a builtin memcpy/memset.
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
#if CUDA_VERSION < 9000
#include "crt/device_runtime.h"
#endif
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
// FIXME: redefine these as __device__ functions.
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
#undef __cxa_vec_delete
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
// math_functions.hpp expects this host function be defined on MacOS, but it
// ends up not being there because of the games we play here. Just define it
// ourselves; it's simple enough.
#ifdef __APPLE__
inline __host__ double __signbitd(double x) {
return std::signbit(x);
}
#endif
// We need decls for functions in CUDA's libdevice with __device__
// attribute only. Alas they come either as __host__ __device__ or
// with no attributes at all. To work around that, define __CUDA_RTC__
// which produces HD variant and undef __host__ which gives us desided
// decls with __device__ attribute.
#pragma push_macro("__host__")
#define __host__
#define __CUDACC_RTC__
#include "device_functions_decls.h"
#undef __CUDACC_RTC__
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
// here to detect the switch.
#if defined(CU_DEVICE_INVALID)
#if !defined(__USE_FAST_MATH__)
#define __USE_FAST_MATH__ 0
#endif
#if !defined(__CUDA_PREC_DIV)
#define __CUDA_PREC_DIV 0
#endif
#endif
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#pragma push_macro("__float2half_rn")
#if CUDA_VERSION >= 9000
// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
// device_functions.hpp out of the way.
#define __float2half_rn __float2half_rn_disabled
#endif
#include "device_functions.hpp"
#pragma pop_macro("__float2half_rn")
// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
//
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif
#include "math_functions.hpp"
#pragma pop_macro("__USE_FAST_MATH__")
#include "math_functions_dbl_ptx3.hpp"
#pragma pop_macro("__forceinline__")
// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ are defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#include "math_functions.hpp"
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
static inline float rsqrt(float __a) { return rsqrtf(__a); }
static inline float rcbrt(float __a) { return rcbrtf(__a); }
static inline float sinpi(float __a) { return sinpif(__a); }
static inline float cospi(float __a) { return cospif(__a); }
static inline void sincospi(float __a, float *__b, float *__c) {
return sincospif(__a, __b, __c);
}
static inline float erfcinv(float __a) { return erfcinvf(__a); }
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }
// For some reason single-argument variant is not always declared by
// CUDA headers. Alas, device_functions.hpp included below needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of thins get declared/defined with __host__ attribute which
// we don't want and we have to define it out. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#include "device_functions.hpp"
#include "sm_20_atomic_functions.hpp"
#include "sm_20_intrinsics.hpp"
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.
#if CUDA_VERSION >= 8000
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#endif
#undef __MATH_FUNCTIONS_HPP__
// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
// math_function.hpp's ::signbit. It's guarded by #undef signbit, but that's
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit
// CUDA-9 omits device-side definitions of some math functions if it sees
// include guard from math.h wrapper from libstdc++. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif
#include "math_functions.hpp"
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
#pragma pop_macro("__host__")
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("__THROW")
// Set up compiler macros expected to be seen during compilation.
#undef __CUDABE__
#define __CUDACC__
extern "C" {
// Device-side CUDA system calls.
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
// We need these declarations and wrappers for device-side
// malloc/free/printf calls to work without relying on
// -fcuda-disable-target-call-checks option.
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize) __attribute__((noreturn));
// In order for standard assert() macro on linux to work we need to
// provide device-side __assert_fail()
__device__ static inline void __assert_fail(const char *__message,
const char *__file, unsigned __line,
const char *__function) {
__assertfail(__message, __file, __line, __function, sizeof(char));
}
// Clang will convert printf into vprintf, but we still need
// device-side declaration for it.
__device__ int printf(const char *, ...);
} // extern "C"
// We also need device-side std::malloc and std::free.
namespace std {
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
__device__ static inline void *malloc(size_t __size) {
return ::malloc(__size);
}
} // namespace std
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
uint3 ret;
ret.x = x;
ret.y = y;
ret.z = z;
return ret;
}
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
uint3 ret;
ret.x = x;
ret.y = y;
ret.z = z;
return ret;
}
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
return dim3(x, y, z);
}
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As as
// hack, force-include the header (nvcc doesn't include it by default) but
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
#define uint3 __cuda_builtin_threadIdx_t
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
#pragma pop_macro("__USE_FAST_MATH__")
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

View File

@@ -0,0 +1,43 @@
/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
*
* Copyright (c) 2014 Chandler Carruth
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_MAX_ALIGN_T_DEFINED
#define __CLANG_MAX_ALIGN_T_DEFINED
#if defined(_MSC_VER)
typedef double max_align_t;
#elif defined(__APPLE__)
typedef long double max_align_t;
#else
// Define 'max_align_t' to match the GCC definition.
typedef struct {
long long __clang_max_align_nonce1
__attribute__((__aligned__(__alignof__(long long))));
long double __clang_max_align_nonce2
__attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
#endif
#endif

View File

@@ -0,0 +1,151 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef _WMMINTRIN_AES_H
#define _WMMINTRIN_AES_H
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
/// \brief Performs a single round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// \brief Generates a round key for AES encyption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
#undef __DEFAULT_FN_ATTRS
#endif /* _WMMINTRIN_AES_H */

View File

@@ -0,0 +1,57 @@
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef _WMMINTRIN_PCLMUL_H
#define _WMMINTRIN_PCLMUL_H
/// \brief Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param __X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a __X, and bit
/// 4 is used to select a value from operand \a __Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(__X, __Y, __I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
(__v2di)(__m128i)(__Y), (char)(__I)))
#endif /* _WMMINTRIN_PCLMUL_H */

View File

@@ -0,0 +1,86 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/* Intrinsics that are available only if __ADX__ defined */
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
unsigned int *__p)
{
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p)
{
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
/* Intrinsics that are also available if __ADX__ undefined */
static __inline unsigned char __DEFAULT_FN_ATTRS
_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
unsigned int *__p)
{
return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
static __inline unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p)
{
return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
}
#endif
static __inline unsigned char __DEFAULT_FN_ATTRS
_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
unsigned int *__p)
{
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
static __inline unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p)
{
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __ADXINTRIN_H */

View File

@@ -0,0 +1 @@
90fd477d9b983fccbe638b6969853a89d7309c8a

View File

@@ -0,0 +1,193 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// \brief Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// \brief Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double *__p, __m128d __a)
{
__builtin_ia32_movntsd(__p, (__v2df)__a);
}
/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float *__p, __m128 __a)
{
__builtin_ia32_movntss(__p, (__v4sf)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AMMINTRIN_H */

View File

@@ -0,0 +1,49 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <arm64intr.h>
#else
#ifndef __ARM64INTR_H
#define __ARM64INTR_H
typedef enum
{
_ARM64_BARRIER_SY = 0xF,
_ARM64_BARRIER_ST = 0xE,
_ARM64_BARRIER_LD = 0xD,
_ARM64_BARRIER_ISH = 0xB,
_ARM64_BARRIER_ISHST = 0xA,
_ARM64_BARRIER_ISHLD = 0x9,
_ARM64_BARRIER_NSH = 0x7,
_ARM64_BARRIER_NSHST = 0x6,
_ARM64_BARRIER_NSHLD = 0x5,
_ARM64_BARRIER_OSH = 0x3,
_ARM64_BARRIER_OSHST = 0x2,
_ARM64_BARRIER_OSHLD = 0x1
} _ARM64INTR_BARRIER_TYPE;
#endif /* __ARM64INTR_H */
#endif /* _MSC_VER */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,45 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else
#ifndef __ARMINTR_H
#define __ARMINTR_H
typedef enum
{
_ARM_BARRIER_SY = 0xF,
_ARM_BARRIER_ST = 0xE,
_ARM_BARRIER_ISH = 0xB,
_ARM_BARRIER_ISHST = 0xA,
_ARM_BARRIER_NSH = 0x7,
_ARM_BARRIER_NSHST = 0x6,
_ARM_BARRIER_OSH = 0x3,
_ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;
#endif /* __ARMINTR_H */
#endif /* _MSC_VER */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,97 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
__U,
__B);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,145 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
#endif

Some files were not shown because too many files have changed in this diff Show More