Math API
-4-bit floating point is not supported
-6-bit floating point is not supported
-8-bit floating point is not supported
typedef __half __nv_half;
typedef __half2 __nv_half2;
typedef __half2_raw __nv_half2_raw;
typedef __half_raw __nv_half_raw;
typedef __half half;
typedef __half2 half2;
typedef __half nv_half;
typedef __half2 nv_half2;
#define CUDART_INF_FP16
#define CUDART_MAX_NORMAL_FP16
#define CUDART_MIN_DENORM_FP16
#define CUDART_NAN_FP16
#define CUDART_NEG_ZERO_FP16
#define CUDART_ONE_FP16
#define CUDART_ZERO_FP16
__half __habs(const __half a);
__device__ __half __habs(const __half a);
__half __hadd(const __half a, const __half b);
__device__ __half __hadd(const __half a, const __half b);
__half __hadd_rn(const __half a, const __half b);
__device__ __half __hadd_rn(const __half a, const __half b);
__half __hadd_sat(const __half a, const __half b);
__device__ __half __hadd_sat(const __half a, const __half b);
__half __hdiv(const __half a, const __half b);
__device__ __half __hdiv(const __half a, const __half b);
__device__ __half __hfma(const __half a, const __half b, const __half c);
__device__ __half __hfma_relu(const __half a, const __half b, const __half c);
__device__ __half __hfma_sat(const __half a, const __half b, const __half c);
__half __hmul(const __half a, const __half b);
__device__ __half __hmul(const __half a, const __half b);
__half __hmul_rn(const __half a, const __half b);
__device__ __half __hmul_rn(const __half a, const __half b);
__half __hmul_sat(const __half a, const __half b);
__device__ __half __hmul_sat(const __half a, const __half b);
__half __hneg(const __half a);
__device__ __half __hneg(const __half a);
__half __hsub(const __half a, const __half b);
__device__ __half __hsub(const __half a, const __half b);
__half __hsub_rn(const __half a, const __half b);
__device__ __half __hsub_rn(const __half a, const __half b);
__half __hsub_sat(const __half a, const __half b);
__device__ __half __hsub_sat(const __half a, const __half b);
__device__ __half atomicAdd(__half *const address, const __half val);
bool __heq(const __half a, const __half b);
__device__ bool __heq(const __half a, const __half b);
bool __hequ(const __half a, const __half b);
__device__ bool __hequ(const __half a, const __half b);
bool __hge(const __half a, const __half b);
__device__ bool __hge(const __half a, const __half b);
bool __hgeu(const __half a, const __half b);
__device__ bool __hgeu(const __half a, const __half b);
bool __hgt(const __half a, const __half b);
__device__ bool __hgt(const __half a, const __half b);
bool __hgtu(const __half a, const __half b);
__device__ bool __hgtu(const __half a, const __half b);
int __hisinf(const __half a);
__device__ int __hisinf(const __half a);
bool __hisnan(const __half a);
__device__ bool __hisnan(const __half a);
bool __hle(const __half a, const __half b);
__device__ bool __hle(const __half a, const __half b);
bool __hleu(const __half a, const __half b);
__device__ bool __hleu(const __half a, const __half b);
bool __hlt(const __half a, const __half b);
__device__ bool __hlt(const __half a, const __half b);
bool __hltu(const __half a, const __half b);
__device__ bool __hltu(const __half a, const __half b);
__half __hmax(const __half a, const __half b);
__device__ __half __hmax(const __half a, const __half b);
__half __hmax_nan(const __half a, const __half b);
__device__ __half __hmax_nan(const __half a, const __half b);
__half __hmin(const __half a, const __half b);
__device__ __half __hmin(const __half a, const __half b);
__half __hmin_nan(const __half a, const __half b);
__device__ __half __hmin_nan(const __half a, const __half b);
bool __hne(const __half a, const __half b);
__device__ bool __hne(const __half a, const __half b);
bool __hneu(const __half a, const __half b);
__device__ bool __hneu(const __half a, const __half b);
__device__ __half hceil(const __half h);
__device__ __half hcos(const __half a);
__device__ __half hexp(const __half a);
__device__ __half hexp10(const __half a);
__device__ __half hexp2(const __half a);
__device__ __half hfloor(const __half h);
__device__ __half hlog(const __half a);
__device__ __half hlog10(const __half a);
__device__ __half hlog2(const __half a);
__device__ __half hrcp(const __half a);
__device__ __half hrint(const __half h);
__device__ __half hrsqrt(const __half a);
__device__ __half hsin(const __half a);
__device__ __half hsqrt(const __half a);
-__device__ __half htanh(const __half a);
-__device__ __half htanh_approx(const __half a);
__device__ __half htrunc(const __half h);
__half __double2half(const double a);
__device__ __half __double2half(const double a);
__half2 __float22half2_rn(const float2 a);
__device__ __half2 __float22half2_rn(const float2 a);
__half __float2half(const float a);
__device__ __half __float2half(const float a);
__half2 __float2half2_rn(const float a);
__device__ __half2 __float2half2_rn(const float a);
-__half __float2half_rd(const float a);
-__device__ __half __float2half_rd(const float a);
__half __float2half_rn(const float a);
__device__ __half __float2half_rn(const float a);
-__half __float2half_ru(const float a);
-__device__ __half __float2half_ru(const float a);
-__half __float2half_rz(const float a);
-__device__ __half __float2half_rz(const float a);
__half2 __floats2half2_rn(const float a, const float b);
__device__ __half2 __floats2half2_rn(const float a, const float b);
float2 __half22float2(const __half2 a);
__device__ float2 __half22float2(const __half2 a);
-signed char __half2char_rz(const __half h);
-__device__ signed char __half2char_rz(const __half h);
float __half2float(const __half a);
__device__ float __half2float(const __half a);
__half2 __half2half2(const __half a);
__device__ __half2 __half2half2(const __half a);
__device__ int __half2int_rd(const __half h);
__device__ int __half2int_rn(const __half h);
__device__ int __half2int_ru(const __half h);
int __half2int_rz(const __half h);
__device__ int __half2int_rz(const __half h);
__device__ long long int __half2ll_rd(const __half h);
__device__ long long int __half2ll_rn(const __half h);
__device__ long long int __half2ll_ru(const __half h);
long long int __half2ll_rz(const __half h);
__device__ long long int __half2ll_rz(const __half h);
__device__ short int __half2short_rd(const __half h);
__device__ short int __half2short_rn(const __half h);
__device__ short int __half2short_ru(const __half h);
short int __half2short_rz(const __half h);
__device__ short int __half2short_rz(const __half h);
-unsigned char __half2uchar_rz(const __half h);
-__device__ unsigned char __half2uchar_rz(const __half h);
__device__ unsigned int __half2uint_rd(const __half h);
__device__ unsigned int __half2uint_rn(const __half h);
__device__ unsigned int __half2uint_ru(const __half h);
unsigned int __half2uint_rz(const __half h);
__device__ unsigned int __half2uint_rz(const __half h);
__device__ unsigned long long int __half2ull_rd(const __half h);
__device__ unsigned long long int __half2ull_rn(const __half h);
__device__ unsigned long long int __half2ull_ru(const __half h);
unsigned long long int __half2ull_rz(const __half h);
__device__ unsigned long long int __half2ull_rz(const __half h);
__device__ unsigned short int __half2ushort_rd(const __half h);
__device__ unsigned short int __half2ushort_rn(const __half h);
__device__ unsigned short int __half2ushort_ru(const __half h);
unsigned short int __half2ushort_rz(const __half h);
__device__ unsigned short int __half2ushort_rz(const __half h);
short int __half_as_short(const __half h);
__device__ short int __half_as_short(const __half h);
unsigned short int __half_as_ushort(const __half h);
__device__ unsigned short int __half_as_ushort(const __half h);
__half2 __halves2half2(const __half a, const __half b);
__device__ __half2 __halves2half2(const __half a, const __half b);
float __high2float(const __half2 a);
__device__ float __high2float(const __half2 a);
__half __high2half(const __half2 a);
__device__ __half __high2half(const __half2 a);
__half2 __high2half2(const __half2 a);
__device__ __half2 __high2half2(const __half2 a);
__half2 __highs2half2(const __half2 a, const __half2 b);
__device__ __half2 __highs2half2(const __half2 a, const __half2 b);
-__half __int2half_rd(const int i);
-__device__ __half __int2half_rd(const int i);
__half __int2half_rn(const int i);
__device__ __half __int2half_rn(const int i);
-__half __int2half_ru(const int i);
-__device__ __half __int2half_ru(const int i);
-__half __int2half_rz(const int i);
-__device__ __half __int2half_rz(const int i);
__device__ __half2 __ldca(const __half2 *const ptr);
__device__ __half __ldca(const __half *const ptr);
__device__ __half __ldcg(const __half *const ptr);
__device__ __half2 __ldcg(const __half2 *const ptr);
__device__ __half __ldcs(const __half *const ptr);
__device__ __half2 __ldcs(const __half2 *const ptr);
__device__ __half2 __ldcv(const __half2 *const ptr);
__device__ __half __ldcv(const __half *const ptr);
__device__ __half2 __ldg(const __half2 *const ptr);
__device__ __half __ldg(const __half *const ptr);
__device__ __half __ldlu(const __half *const ptr);
__device__ __half2 __ldlu(const __half2 *const ptr);
-__half __ll2half_rd(const long long int i);
-__device__ __half __ll2half_rd(const long long int i);
__half __ll2half_rn(const long long int i);
__device__ __half __ll2half_rn(const long long int i);
-__half __ll2half_ru(const long long int i);
-__device__ __half __ll2half_ru(const long long int i);
-__half __ll2half_rz(const long long int i);
-__device__ __half __ll2half_rz(const long long int i);
float __low2float(const __half2 a);
__device__ float __low2float(const __half2 a);
__half __low2half(const __half2 a);
__device__ __half __low2half(const __half2 a);
__half2 __low2half2(const __half2 a);
__device__ __half2 __low2half2(const __half2 a);
__half2 __lowhigh2highlow(const __half2 a);
__device__ __half2 __lowhigh2highlow(const __half2 a);
__half2 __lows2half2(const __half2 a, const __half2 b);
__device__ __half2 __lows2half2(const __half2 a, const __half2 b);
-__device__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width=warpSize);
-__device__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width=warpSize);
-__device__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width=warpSize);
-__device__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width=warpSize);
-__device__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width=warpSize);
-__device__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width=warpSize);
-__device__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width=warpSize);
-__device__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width=warpSize);
-__half __short2half_rd(const short int i);
-__device__ __half __short2half_rd(const short int i);
__half __short2half_rn(const short int i);
__device__ __half __short2half_rn(const short int i);
-__half __short2half_ru(const short int i);
-__device__ __half __short2half_ru(const short int i);
-__half __short2half_rz(const short int i);
-__device__ __half __short2half_rz(const short int i);
__half __short_as_half(const short int i);
__device__ __half __short_as_half(const short int i);
__device__ void __stcg(__half2 *const ptr, const __half2 value);
__device__ void __stcg(__half *const ptr, const __half value);
__device__ void __stcs(__half2 *const ptr, const __half2 value);
__device__ void __stcs(__half *const ptr, const __half value);
__device__ void __stwb(__half2 *const ptr, const __half2 value);
__device__ void __stwb(__half *const ptr, const __half value);
__device__ void __stwt(__half *const ptr, const __half value);
__device__ void __stwt(__half2 *const ptr, const __half2 value);
-__half __uint2half_rd(const unsigned int i);
-__device__ __half __uint2half_rd(const unsigned int i);
__half __uint2half_rn(const unsigned int i);
__device__ __half __uint2half_rn(const unsigned int i);
-__half __uint2half_ru(const unsigned int i);
-__device__ __half __uint2half_ru(const unsigned int i);
-__half __uint2half_rz(const unsigned int i);
-__device__ __half __uint2half_rz(const unsigned int i);
-__half __ull2half_rd(const unsigned long long int i);
-__device__ __half __ull2half_rd(const unsigned long long int i);
__half __ull2half_rn(const unsigned long long int i);
__device__ __half __ull2half_rn(const unsigned long long int i);
-__half __ull2half_ru(const unsigned long long int i);
-__device__ __half __ull2half_ru(const unsigned long long int i);
-__half __ull2half_rz(const unsigned long long int i);
-__device__ __half __ull2half_rz(const unsigned long long int i);
-__half __ushort2half_rd(const unsigned short int i);
-__device__ __half __ushort2half_rd(const unsigned short int i);
__half __ushort2half_rn(const unsigned short int i);
__device__ __half __ushort2half_rn(const unsigned short int i);
-__half __ushort2half_ru(const unsigned short int i);
-__device__ __half __ushort2half_ru(const unsigned short int i);
-__half __ushort2half_rz(const unsigned short int i);
-__device__ __half __ushort2half_rz(const unsigned short int i);
__half __ushort_as_half(const unsigned short int i);
__device__ __half __ushort_as_half(const unsigned short int i);
__half2 make_half2(const __half x, const __half y);
__device__ __half2 make_half2(const __half x, const __half y);
__half2 __h2div(const __half2 a, const __half2 b);
__device__ __half2 __h2div(const __half2 a, const __half2 b);
__half2 __habs2(const __half2 a);
__device__ __half2 __habs2(const __half2 a);
__half2 __hadd2(const __half2 a, const __half2 b);
__device__ __half2 __hadd2(const __half2 a, const __half2 b);
__half2 __hadd2_rn(const __half2 a, const __half2 b);
__device__ __half2 __hadd2_rn(const __half2 a, const __half2 b);
__half2 __hadd2_sat(const __half2 a, const __half2 b);
__device__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
__device__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c);
__device__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
__device__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
__device__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
__half2 __hmul2(const __half2 a, const __half2 b);
__device__ __half2 __hmul2(const __half2 a, const __half2 b);
__half2 __hmul2_rn(const __half2 a, const __half2 b);
__device__ __half2 __hmul2_rn(const __half2 a, const __half2 b);
__half2 __hmul2_sat(const __half2 a, const __half2 b);
__device__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
__half2 __hneg2(const __half2 a);
__device__ __half2 __hneg2(const __half2 a);
__half2 __hsub2(const __half2 a, const __half2 b);
__device__ __half2 __hsub2(const __half2 a, const __half2 b);
__half2 __hsub2_rn(const __half2 a, const __half2 b);
__device__ __half2 __hsub2_rn(const __half2 a, const __half2 b);
__half2 __hsub2_sat(const __half2 a, const __half2 b);
__device__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
__device__ __half2 atomicAdd(__half2 *const address, const __half2 val);
bool __hbeq2(const __half2 a, const __half2 b);
__device__ bool __hbeq2(const __half2 a, const __half2 b);
bool __hbequ2(const __half2 a, const __half2 b);
__device__ bool __hbequ2(const __half2 a, const __half2 b);
bool __hbge2(const __half2 a, const __half2 b);
__device__ bool __hbge2(const __half2 a, const __half2 b);
bool __hbgeu2(const __half2 a, const __half2 b);
__device__ bool __hbgeu2(const __half2 a, const __half2 b);
bool __hbgt2(const __half2 a, const __half2 b);
__device__ bool __hbgt2(const __half2 a, const __half2 b);
bool __hbgtu2(const __half2 a, const __half2 b);
__device__ bool __hbgtu2(const __half2 a, const __half2 b);
bool __hble2(const __half2 a, const __half2 b);
__device__ bool __hble2(const __half2 a, const __half2 b);
bool __hbleu2(const __half2 a, const __half2 b);
__device__ bool __hbleu2(const __half2 a, const __half2 b);
bool __hblt2(const __half2 a, const __half2 b);
__device__ bool __hblt2(const __half2 a, const __half2 b);
bool __hbltu2(const __half2 a, const __half2 b);
__device__ bool __hbltu2(const __half2 a, const __half2 b);
bool __hbne2(const __half2 a, const __half2 b);
__device__ bool __hbne2(const __half2 a, const __half2 b);
bool __hbneu2(const __half2 a, const __half2 b);
__device__ bool __hbneu2(const __half2 a, const __half2 b);
__half2 __heq2(const __half2 a, const __half2 b);
__device__ __half2 __heq2(const __half2 a, const __half2 b);
unsigned int __heq2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __heq2_mask(const __half2 a, const __half2 b);
__half2 __hequ2(const __half2 a, const __half2 b);
__device__ __half2 __hequ2(const __half2 a, const __half2 b);
unsigned int __hequ2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hequ2_mask(const __half2 a, const __half2 b);
__half2 __hge2(const __half2 a, const __half2 b);
__device__ __half2 __hge2(const __half2 a, const __half2 b);
unsigned int __hge2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hge2_mask(const __half2 a, const __half2 b);
__half2 __hgeu2(const __half2 a, const __half2 b);
__device__ __half2 __hgeu2(const __half2 a, const __half2 b);
unsigned int __hgeu2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b);
__half2 __hgt2(const __half2 a, const __half2 b);
__device__ __half2 __hgt2(const __half2 a, const __half2 b);
unsigned int __hgt2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hgt2_mask(const __half2 a, const __half2 b);
__half2 __hgtu2(const __half2 a, const __half2 b);
__device__ __half2 __hgtu2(const __half2 a, const __half2 b);
unsigned int __hgtu2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b);
__half2 __hisnan2(const __half2 a);
__device__ __half2 __hisnan2(const __half2 a);
__half2 __hle2(const __half2 a, const __half2 b);
__device__ __half2 __hle2(const __half2 a, const __half2 b);
unsigned int __hle2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hle2_mask(const __half2 a, const __half2 b);
__half2 __hleu2(const __half2 a, const __half2 b);
__device__ __half2 __hleu2(const __half2 a, const __half2 b);
unsigned int __hleu2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hleu2_mask(const __half2 a, const __half2 b);
__half2 __hlt2(const __half2 a, const __half2 b);
__device__ __half2 __hlt2(const __half2 a, const __half2 b);
unsigned int __hlt2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hlt2_mask(const __half2 a, const __half2 b);
__half2 __hltu2(const __half2 a, const __half2 b);
__device__ __half2 __hltu2(const __half2 a, const __half2 b);
unsigned int __hltu2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hltu2_mask(const __half2 a, const __half2 b);
__half2 __hmax2(const __half2 a, const __half2 b);
__device__ __half2 __hmax2(const __half2 a, const __half2 b);
__half2 __hmax2_nan(const __half2 a, const __half2 b);
__device__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
__half2 __hmin2(const __half2 a, const __half2 b);
__device__ __half2 __hmin2(const __half2 a, const __half2 b);
__half2 __hmin2_nan(const __half2 a, const __half2 b);
__device__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
__half2 __hne2(const __half2 a, const __half2 b);
__device__ __half2 __hne2(const __half2 a, const __half2 b);
unsigned int __hne2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hne2_mask(const __half2 a, const __half2 b);
__half2 __hneu2(const __half2 a, const __half2 b);
__device__ __half2 __hneu2(const __half2 a, const __half2 b);
unsigned int __hneu2_mask(const __half2 a, const __half2 b);
__device__ unsigned int __hneu2_mask(const __half2 a, const __half2 b);
__device__ __half2 h2ceil(const __half2 h);
__device__ __half2 h2cos(const __half2 a);
__device__ __half2 h2exp(const __half2 a);
__device__ __half2 h2exp10(const __half2 a);
__device__ __half2 h2exp2(const __half2 a);
__device__ __half2 h2floor(const __half2 h);
__device__ __half2 h2log(const __half2 a);
__device__ __half2 h2log10(const __half2 a);
__device__ __half2 h2log2(const __half2 a);
__device__ __half2 h2rcp(const __half2 a);
__device__ __half2 h2rint(const __half2 h);
__device__ __half2 h2rsqrt(const __half2 a);
__device__ __half2 h2sin(const __half2 a);
__device__ __half2 h2sqrt(const __half2 a);
-__device__ __half2 h2tanh(const __half2 a);
-__device__ __half2 h2tanh_approx(const __half2 a);
__device__ __half2 h2trunc(const __half2 h);
typedef __nv_bfloat16 nv_bfloat16;
typedef __nv_bfloat162 nv_bfloat162;
#define CUDART_INF_BF16
#define CUDART_MAX_NORMAL_BF16
#define CUDART_MIN_DENORM_BF16
#define CUDART_NAN_BF16
#define CUDART_NEG_ZERO_BF16
#define CUDART_ONE_BF16
#define CUDART_ZERO_BF16
__nv_bfloat16 __habs(const __nv_bfloat16 a);
__device__ __nv_bfloat16 __habs(const __nv_bfloat16 a);
__nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
__device__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
__device__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
__nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hneg(const __nv_bfloat16 a);
__device__ __nv_bfloat16 __hneg(const __nv_bfloat16 a);
__nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val);
bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
int __hisinf(const __nv_bfloat16 a);
__device__ int __hisinf(const __nv_bfloat16 a);
bool __hisnan(const __nv_bfloat16 a);
__device__ bool __hisnan(const __nv_bfloat16 a);
bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
__device__ __nv_bfloat16 hcos(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hexp(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hexp10(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hexp2(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
__device__ __nv_bfloat16 hlog(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hlog10(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hlog2(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hrcp(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
__device__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hsin(const __nv_bfloat16 a);
__device__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a);
-__device__ __nv_bfloat16 htanh(const __nv_bfloat16 a);
-__device__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a);
__device__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
float2 __bfloat1622float2(const __nv_bfloat162 a);
__device__ float2 __bfloat1622float2(const __nv_bfloat162 a);
__nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
__device__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
-signed char __bfloat162char_rz(const __nv_bfloat16 h);
-__device__ signed char __bfloat162char_rz(const __nv_bfloat16 h);
float __bfloat162float(const __nv_bfloat16 a);
__device__ float __bfloat162float(const __nv_bfloat16 a);
__device__ int __bfloat162int_rd(const __nv_bfloat16 h);
__device__ int __bfloat162int_rn(const __nv_bfloat16 h);
__device__ int __bfloat162int_ru(const __nv_bfloat16 h);
int __bfloat162int_rz(const __nv_bfloat16 h);
__device__ int __bfloat162int_rz(const __nv_bfloat16 h);
__device__ long long int __bfloat162ll_rd(const __nv_bfloat16 h);
__device__ long long int __bfloat162ll_rn(const __nv_bfloat16 h);
__device__ long long int __bfloat162ll_ru(const __nv_bfloat16 h);
long long int __bfloat162ll_rz(const __nv_bfloat16 h);
__device__ long long int __bfloat162ll_rz(const __nv_bfloat16 h);
__device__ short int __bfloat162short_rd(const __nv_bfloat16 h);
__device__ short int __bfloat162short_rn(const __nv_bfloat16 h);
__device__ short int __bfloat162short_ru(const __nv_bfloat16 h);
short int __bfloat162short_rz(const __nv_bfloat16 h);
__device__ short int __bfloat162short_rz(const __nv_bfloat16 h);
-unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h);
-__device__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h);
__device__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h);
__device__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h);
__device__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h);
unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
__device__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
__device__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h);
__device__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h);
__device__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h);
unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
__device__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
__device__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h);
__device__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h);
__device__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h);
unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
__device__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
short int __bfloat16_as_short(const __nv_bfloat16 h);
__device__ short int __bfloat16_as_short(const __nv_bfloat16 h);
unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
__device__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
__nv_bfloat16 __double2bfloat16(const double a);
__device__ __nv_bfloat16 __double2bfloat16(const double a);
__nv_bfloat162 __float22bfloat162_rn(const float2 a);
__device__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
__nv_bfloat16 __float2bfloat16(const float a);
__device__ __nv_bfloat16 __float2bfloat16(const float a);
__nv_bfloat162 __float2bfloat162_rn(const float a);
__device__ __nv_bfloat162 __float2bfloat162_rn(const float a);
-__nv_bfloat16 __float2bfloat16_rd(const float a);
-__device__ __nv_bfloat16 __float2bfloat16_rd(const float a);
__nv_bfloat16 __float2bfloat16_rn(const float a);
__device__ __nv_bfloat16 __float2bfloat16_rn(const float a);
-__nv_bfloat16 __float2bfloat16_ru(const float a);
-__device__ __nv_bfloat16 __float2bfloat16_ru(const float a);
-__nv_bfloat16 __float2bfloat16_rz(const float a);
-__device__ __nv_bfloat16 __float2bfloat16_rz(const float a);
__nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
__device__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
__nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
__device__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
__nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
__device__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
__nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
float __high2float(const __nv_bfloat162 a);
__device__ float __high2float(const __nv_bfloat162 a);
__nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
-__device__ __nv_bfloat16 __int2bfloat16_rd(const int i);
__nv_bfloat16 __int2bfloat16_rn(const int i);
__device__ __nv_bfloat16 __int2bfloat16_rn(const int i);
-__device__ __nv_bfloat16 __int2bfloat16_ru(const int i);
-__device__ __nv_bfloat16 __int2bfloat16_rz(const int i);
__device__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr);
__device__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr);
__device__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr);
__device__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr);
__device__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr);
__device__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr);
__device__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr);
-__device__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i);
__nv_bfloat16 __ll2bfloat16_rn(const long long int i);
__device__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i);
-__device__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
-__device__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i);
__nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
__device__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
__nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
float __low2float(const __nv_bfloat162 a);
__device__ float __low2float(const __nv_bfloat162 a);
__nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
__nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
-__device__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width=warpSize);
-__device__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width=warpSize);
-__device__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width=warpSize);
-__device__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width=warpSize);
-__device__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width=warpSize);
-__device__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width=warpSize);
-__device__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width=warpSize);
-__device__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width=warpSize);
-__device__ __nv_bfloat16 __short2bfloat16_rd(const short int i);
__nv_bfloat16 __short2bfloat16_rn(const short int i);
__device__ __nv_bfloat16 __short2bfloat16_rn(const short int i);
-__device__ __nv_bfloat16 __short2bfloat16_ru(const short int i);
-__device__ __nv_bfloat16 __short2bfloat16_rz(const short int i);
__nv_bfloat16 __short_as_bfloat16(const short int i);
__device__ __nv_bfloat16 __short_as_bfloat16(const short int i);
__device__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
__device__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
__device__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
__device__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
__device__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
__device__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
__device__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
__device__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
-__device__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i);
__nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
__device__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
-__device__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i);
-__device__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i);
-__device__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i);
__nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
__device__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
-__device__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i);
-__device__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i);
-__device__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i);
__nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
__device__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
-__device__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i);
-__device__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i);
__nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
__device__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
__nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y);
__device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y);
__nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __habs2(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
__nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
__device__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
__device__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
__device__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
__nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hneg2(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a);
__nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val);
bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
__device__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
__nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
__device__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
__device__ __nv_bfloat162 h2cos(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2exp(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
__device__ __nv_bfloat162 h2log(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2log10(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2log2(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
__device__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2sin(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a);
-__device__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a);
-__device__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a);
__device__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
__device__ float acosf(float x);
__device__ float acoshf(float x);
__device__ float asinf(float x);
__device__ float asinhf(float x);
__device__ float atan2f(float y, float x);
__device__ float atanf(float x);
__device__ float atanhf(float x);
__device__ float cbrtf(float x);
__device__ float ceilf(float x);
__device__ float copysignf(float x, float y);
__device__ float cosf(float x);
__device__ float coshf(float x);
__device__ float cospif(float x);
-__device__ float cyl_bessel_i0f(float x);
-__device__ float cyl_bessel_i1f(float x);
__device__ float erfcf(float x);
__device__ float erfcinvf(float x);
-__device__ float erfcxf(float x);
__device__ float erff(float x);
__device__ float erfinvf(float x);
__device__ float exp10f(float x);
__device__ float exp2f(float x);
__device__ float expf(float x);
__device__ float expm1f(float x);
__device__ float fabsf(float x);
__device__ float fdimf(float x, float y);
__device__ float fdividef(float x, float y);
__device__ float floorf(float x);
__device__ float fmaf(float x, float y, float z);
__device__ float fmaxf(float x, float y);
__device__ float fminf(float x, float y);
__device__ float fmodf(float x, float y);
__device__ float frexpf(float x, int *nptr);
__device__ float hypotf(float x, float y);
__device__ int ilogbf(float x);
__device__ bool isfinite(float a);
__device__ bool isinf(float a);
__device__ bool isnan(float a);
-__device__ float j0f(float x);
-__device__ float j1f(float x);
-__device__ float jnf(int n, float x);
__device__ float ldexpf(float x, int exp);
__device__ float lgammaf(float x);
__device__ long long int llrintf(float x);
__device__ long long int llroundf(float x);
__device__ float log10f(float x);
__device__ float log1pf(float x);
__device__ float log2f(float x);
__device__ float logbf(float x);
__device__ float logf(float x);
__device__ long int lrintf(float x);
__device__ long int lroundf(float x);
__device__ float max(const float a, const float b);
__device__ float min(const float a, const float b);
__device__ float modff(float x, float *iptr);
__device__ float nanf(const char *tagp);
__device__ float nearbyintf(float x);
__device__ float nextafterf(float x, float y);
__device__ float norm3df(float a, float b, float c);
__device__ float norm4df(float a, float b, float c, float d);
__device__ float normcdff(float x);
-__device__ float normcdfinvf(float x);
__device__ float normf(int dim, float const *p);
__device__ float powf(float x, float y);
-__device__ float rcbrtf(float x);
__device__ float remainderf(float x, float y);
__device__ float remquof(float x, float y, int *quo);
__device__ float rhypotf(float x, float y);
__device__ float rintf(float x);
__device__ float rnorm3df(float a, float b, float c);
__device__ float rnorm4df(float a, float b, float c, float d);
__device__ float rnormf(int dim, float const *p);
__device__ float roundf(float x);
__device__ float rsqrtf(float x);
__device__ float scalblnf(float x, long int n);
__device__ float scalbnf(float x, int n);
__device__ bool signbit(float a);
__device__ void sincosf(float x, float *sptr, float *cptr);
__device__ void sincospif(float x, float *sptr, float *cptr);
__device__ float sinf(float x);
__device__ float sinhf(float x);
__device__ float sinpif(float x);
__device__ float sqrtf(float x);
__device__ float tanf(float x);
__device__ float tanhf(float x);
__device__ float tgammaf(float x);
__device__ float truncf(float x);
-__device__ float y0f(float x);
-__device__ float y1f(float x);
-__device__ float ynf(int n, float x);
__device__ float __cosf(float x);
__device__ float __exp10f(float x);
__device__ float __expf(float x);
-__device__ float2 __fadd2_rd(float2 x, float2 y);
-__device__ float2 __fadd2_rn(float2 x, float2 y);
-__device__ float2 __fadd2_ru(float2 x, float2 y);
-__device__ float2 __fadd2_rz(float2 x, float2 y);
__device__ float __fadd_rd(float x, float y);
__device__ float __fadd_rn(float x, float y);
__device__ float __fadd_ru(float x, float y);
__device__ float __fadd_rz(float x, float y);
__device__ float __fdiv_rd(float x, float y);
__device__ float __fdiv_rn(float x, float y);
__device__ float __fdiv_ru(float x, float y);
__device__ float __fdiv_rz(float x, float y);
__device__ float __fdividef(float x, float y);
-__device__ float2 __ffma2_rd(float2 x, float2 y, float2 z);
-__device__ float2 __ffma2_rn(float2 x, float2 y, float2 z);
-__device__ float2 __ffma2_ru(float2 x, float2 y, float2 z);
-__device__ float2 __ffma2_rz(float2 x, float2 y, float2 z);
__device__ float __fmaf_ieee_rd(float x, float y, float z);
__device__ float __fmaf_ieee_rn(float x, float y, float z);
__device__ float __fmaf_ieee_ru(float x, float y, float z);
__device__ float __fmaf_ieee_rz(float x, float y, float z);
__device__ float __fmaf_rd(float x, float y, float z);
__device__ float __fmaf_rn(float x, float y, float z);
__device__ float __fmaf_ru(float x, float y, float z);
__device__ float __fmaf_rz(float x, float y, float z);
-__device__ float2 __fmul2_rd(float2 x, float2 y);
-__device__ float2 __fmul2_rn(float2 x, float2 y);
-__device__ float2 __fmul2_ru(float2 x, float2 y);
-__device__ float2 __fmul2_rz(float2 x, float2 y);
__device__ float __fmul_rd(float x, float y);
__device__ float __fmul_rn(float x, float y);
__device__ float __fmul_ru(float x, float y);
__device__ float __fmul_rz(float x, float y);
__device__ float __frcp_rd(float x);
__device__ float __frcp_rn(float x);
__device__ float __frcp_ru(float x);
__device__ float __frcp_rz(float x);
__device__ float __frsqrt_rn(float x);
__device__ float __fsqrt_rd(float x);
__device__ float __fsqrt_rn(float x);
__device__ float __fsqrt_ru(float x);
__device__ float __fsqrt_rz(float x);
__device__ float __fsub_rd(float x, float y);
__device__ float __fsub_rn(float x, float y);
__device__ float __fsub_ru(float x, float y);
__device__ float __fsub_rz(float x, float y);
__device__ float __log10f(float x);
__device__ float __log2f(float x);
__device__ float __logf(float x);
__device__ float __powf(float x, float y);
__device__ float __saturatef(float x);
__device__ void __sincosf(float x, float *sptr, float *cptr);
__device__ float __sinf(float x);
__device__ float __tanf(float x);
-__device__ float __tanhf(float x);
__device__ double acos(double x);
__device__ double acosh(double x);
__device__ double asin(double x);
__device__ double asinh(double x);
__device__ double atan(double x);
__device__ double atan2(double y, double x);
__device__ double atanh(double x);
__device__ double cbrt(double x);
__device__ double ceil(double x);
__device__ double copysign(double x, double y);
__device__ double cos(double x);
__device__ double cosh(double x);
__device__ double cospi(double x);
-__device__ double cyl_bessel_i0(double x);
-__device__ double cyl_bessel_i1(double x);
__device__ double erf(double x);
__device__ double erfc(double x);
__device__ double erfcinv(double x);
-__device__ double erfcx(double x);
__device__ double erfinv(double x);
__device__ double exp(double x);
__device__ double exp10(double x);
__device__ double exp2(double x);
__device__ double expm1(double x);
__device__ double fabs(double x);
__device__ double fdim(double x, double y);
__device__ double floor(double x);
__device__ double fma(double x, double y, double z);
__device__ double fmax(double x, double y);
__device__ double fmin(double x, double y);
__device__ double fmod(double x, double y);
__device__ double frexp(double x, int *nptr);
__device__ double hypot(double x, double y);
__device__ int ilogb(double x);
__device__ bool isfinite(double a);
__device__ bool isinf(double a);
__device__ bool isnan(double a);
-__device__ double j0(double x);
-__device__ double j1(double x);
-__device__ double jn(int n, double x);
__device__ double ldexp(double x, int exp);
__device__ double lgamma(double x);
__device__ long long int llrint(double x);
__device__ long long int llround(double x);
__device__ double log(double x);
__device__ double log10(double x);
__device__ double log1p(double x);
__device__ double log2(double x);
__device__ double logb(double x);
__device__ long int lrint(double x);
__device__ long int lround(double x);
__device__ double max(const float a, const double b);
__device__ double max(const double a, const float b);
__device__ double max(const double a, const double b);
__device__ double min(const float a, const double b);
__device__ double min(const double a, const double b);
__device__ double min(const double a, const float b);
__device__ double modf(double x, double *iptr);
__device__ double nan(const char *tagp);
__device__ double nearbyint(double x);
__device__ double nextafter(double x, double y);
__device__ double norm(int dim, double const *p);
__device__ double norm3d(double a, double b, double c);
__device__ double norm4d(double a, double b, double c, double d);
__device__ double normcdf(double x);
-__device__ double normcdfinv(double x);
__device__ double pow(double x, double y);
-__device__ double rcbrt(double x);
__device__ double remainder(double x, double y);
__device__ double remquo(double x, double y, int *quo);
__device__ double rhypot(double x, double y);
__device__ double rint(double x);
__device__ double rnorm(int dim, double const *p);
__device__ double rnorm3d(double a, double b, double c);
__device__ double rnorm4d(double a, double b, double c, double d);
__device__ double round(double x);
__device__ double rsqrt(double x);
__device__ double scalbln(double x, long int n);
__device__ double scalbn(double x, int n);
__device__ bool signbit(double a);
__device__ double sin(double x);
__device__ void sincos(double x, double *sptr, double *cptr);
__device__ void sincospi(double x, double *sptr, double *cptr);
__device__ double sinh(double x);
__device__ double sinpi(double x);
__device__ double sqrt(double x);
__device__ double tan(double x);
__device__ double tanh(double x);
__device__ double tgamma(double x);
__device__ double trunc(double x);
-__device__ double y0(double x);
-__device__ double y1(double x);
-__device__ double yn(int n, double x);
__device__ double __dadd_rd(double x, double y);
__device__ double __dadd_rn(double x, double y);
__device__ double __dadd_ru(double x, double y);
__device__ double __dadd_rz(double x, double y);
__device__ double __ddiv_rd(double x, double y);
__device__ double __ddiv_rn(double x, double y);
__device__ double __ddiv_ru(double x, double y);
__device__ double __ddiv_rz(double x, double y);
__device__ double __dmul_rd(double x, double y);
__device__ double __dmul_rn(double x, double y);
__device__ double __dmul_ru(double x, double y);
__device__ double __dmul_rz(double x, double y);
__device__ double __drcp_rd(double x);
__device__ double __drcp_rn(double x);
__device__ double __drcp_ru(double x);
__device__ double __drcp_rz(double x);
__device__ double __dsqrt_rd(double x);
__device__ double __dsqrt_rn(double x);
__device__ double __dsqrt_ru(double x);
__device__ double __dsqrt_rz(double x);
__device__ double __dsub_rd(double x, double y);
__device__ double __dsub_rn(double x, double y);
__device__ double __dsub_ru(double x, double y);
__device__ double __dsub_rz(double x, double y);
__device__ double __fma_rd(double x, double y, double z);
__device__ double __fma_rn(double x, double y, double z);
__device__ double __fma_ru(double x, double y, double z);
__device__ double __fma_rz(double x, double y, double z);
-128 bit floating point is not supported
-__device__ float __double2float_rd(double x);
-__device__ float __double2float_rn(double x);
-__device__ float __double2float_ru(double x);
-__device__ float __double2float_rz(double x);
__device__ int __double2hiint(double x);
__device__ int __double2int_rd(double x);
__device__ int __double2int_rn(double x);
__device__ int __double2int_ru(double x);
__device__ int __double2int_rz(double x);
__device__ long long int __double2ll_rd(double x);
__device__ long long int __double2ll_rn(double x);
__device__ long long int __double2ll_ru(double x);
__device__ long long int __double2ll_rz(double x);
__device__ int __double2loint(double x);
__device__ unsigned int __double2uint_rd(double x);
__device__ unsigned int __double2uint_rn(double x);
__device__ unsigned int __double2uint_ru(double x);
__device__ unsigned int __double2uint_rz(double x);
__device__ unsigned long long int __double2ull_rd(double x);
__device__ unsigned long long int __double2ull_rn(double x);
__device__ unsigned long long int __double2ull_ru(double x);
__device__ unsigned long long int __double2ull_rz(double x);
__device__ long long int __double_as_longlong(double x);
__device__ int __float2int_rd(float x);
__device__ int __float2int_rn(float x);
__device__ int __float2int_ru(float x);
__device__ int __float2int_rz(float x);
__device__ long long int __float2ll_rd(float x);
__device__ long long int __float2ll_rn(float x);
__device__ long long int __float2ll_ru(float x);
__device__ long long int __float2ll_rz(float x);
__device__ unsigned int __float2uint_rd(float x);
__device__ unsigned int __float2uint_rn(float x);
__device__ unsigned int __float2uint_ru(float x);
__device__ unsigned int __float2uint_rz(float x);
__device__ unsigned long long int __float2ull_rd(float x);
__device__ unsigned long long int __float2ull_rn(float x);
__device__ unsigned long long int __float2ull_ru(float x);
__device__ unsigned long long int __float2ull_rz(float x);
__device__ int __float_as_int(float x);
__device__ unsigned int __float_as_uint(float x);
__device__ double __hiloint2double(int hi, int lo);
__device__ double __int2double_rn(int x);
-__device__ float __int2float_rd(int x);
__device__ float __int2float_rn(int x);
-__device__ float __int2float_ru(int x);
-__device__ float __int2float_rz(int x);
__device__ float __int_as_float(int x);
-__device__ double __ll2double_rd(long long int x);
-__device__ double __ll2double_rn(long long int x);
-__device__ double __ll2double_ru(long long int x);
-__device__ double __ll2double_rz(long long int x);
-__device__ float __ll2float_rd(long long int x);
-__device__ float __ll2float_rn(long long int x);
-__device__ float __ll2float_ru(long long int x);
-__device__ float __ll2float_rz(long long int x);
__device__ double __longlong_as_double(long long int x);
__device__ double __uint2double_rn(unsigned int x);
-__device__ float __uint2float_rd(unsigned int x);
__device__ float __uint2float_rn(unsigned int x);
-__device__ float __uint2float_ru(unsigned int x);
-__device__ float __uint2float_rz(unsigned int x);
__device__ float __uint_as_float(unsigned int x);
-__device__ double __ull2double_rd(unsigned long long int x);
-__device__ double __ull2double_rn(unsigned long long int x);
-__device__ double __ull2double_ru(unsigned long long int x);
-__device__ double __ull2double_rz(unsigned long long int x);
-__device__ float __ull2float_rd(unsigned long long int x);
-__device__ float __ull2float_rn(unsigned long long int x);
-__device__ float __ull2float_ru(unsigned long long int x);
-__device__ float __ull2float_rz(unsigned long long int x);
-__device__ long int abs(long int a);
__device__ int abs(int a);
-__device__ long long int abs(long long int a);
__device__ long int labs(long int a);
__device__ long long int llabs(long long int a);
__device__ long long int llmax(const long long int a, const long long int b);
__device__ long long int llmin(const long long int a, const long long int b);
__device__ unsigned long int max(const long int a, const unsigned long int b);
__device__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b);
__device__ unsigned int max(const unsigned int a, const int b);
__device__ unsigned long long int max(const long long int a, const unsigned long long int b);
__device__ unsigned long int max(const unsigned long int a, const unsigned long int b);
__device__ long long int max(const long long int a, const long long int b);
__device__ unsigned long long int max(const unsigned long long int a, const long long int b);
__device__ unsigned long int max(const unsigned long int a, const long int b);
__device__ long int max(const long int a, const long int b);
__device__ int max(const int a, const int b);
__device__ unsigned int max(const unsigned int a, const unsigned int b);
__device__ unsigned int max(const int a, const unsigned int b);
__device__ unsigned long int min(const long int a, const unsigned long int b);
__device__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b);
__device__ unsigned long long int min(const unsigned long long int a, const long long int b);
__device__ int min(const int a, const int b);
__device__ unsigned int min(const unsigned int a, const int b);
__device__ unsigned long long int min(const long long int a, const unsigned long long int b);
__device__ long long int min(const long long int a, const long long int b);
__device__ unsigned int min(const int a, const unsigned int b);
__device__ long int min(const long int a, const long int b);
__device__ unsigned int min(const unsigned int a, const unsigned int b);
__device__ unsigned long int min(const unsigned long int a, const long int b);
__device__ unsigned long int min(const unsigned long int a, const unsigned long int b);
__device__ unsigned long long int ullmax(const unsigned long long int a, const unsigned long long int b);
__device__ unsigned long long int ullmin(const unsigned long long int a, const unsigned long long int b);
__device__ unsigned int umax(const unsigned int a, const unsigned int b);
__device__ unsigned int umin(const unsigned int a, const unsigned int b);
__device__ unsigned int __brev(unsigned int x);
__device__ unsigned long long int __brevll(unsigned long long int x);
__device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ int __clz(int x);
__device__ int __clzll(long long int x);
__device__ int __dp2a_hi(int srcA, int srcB, int c);
__device__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c);
__device__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c);
__device__ int __dp2a_hi(short2 srcA, char4 srcB, int c);
__device__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c);
__device__ int __dp2a_lo(short2 srcA, char4 srcB, int c);
__device__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c);
__device__ int __dp2a_lo(int srcA, int srcB, int c);
__device__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c);
__device__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c);
__device__ int __dp4a(int srcA, int srcB, int c);
__device__ int __dp4a(char4 srcA, char4 srcB, int c);
__device__ int __ffs(int x);
__device__ int __ffsll(long long int x);
__device__ unsigned __fns(unsigned mask, unsigned base, int offset);
__device__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift);
__device__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift);
__device__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift);
__device__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift);
__device__ int __hadd(int x, int y);
__device__ int __mul24(int x, int y);
__device__ long long int __mul64hi(long long int x, long long int y);
__device__ int __mulhi(int x, int y);
-unsigned short __nv_bswap16(unsigned short x);
-__device__ unsigned short __nv_bswap16(unsigned short x);
-unsigned int __nv_bswap32(unsigned int x);
-__device__ unsigned int __nv_bswap32(unsigned int x);
-unsigned long long __nv_bswap64(unsigned long long x);
-__device__ unsigned long long __nv_bswap64(unsigned long long x);
__device__ int __popc(unsigned int x);
__device__ int __popcll(unsigned long long int x);
__device__ int __rhadd(int x, int y);
__device__ unsigned int __sad(int x, int y, unsigned int z);
__device__ unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ unsigned int __umul24(unsigned int x, unsigned int y);
__device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
__device__ unsigned int __vabs2(unsigned int a);
__device__ unsigned int __vabs4(unsigned int a);
__device__ unsigned int __vabsdiffs2(unsigned int a, unsigned int b);
__device__ unsigned int __vabsdiffs4(unsigned int a, unsigned int b);
__device__ unsigned int __vabsdiffu2(unsigned int a, unsigned int b);
__device__ unsigned int __vabsdiffu4(unsigned int a, unsigned int b);
__device__ unsigned int __vabsss2(unsigned int a);
__device__ unsigned int __vabsss4(unsigned int a);
__device__ unsigned int __vadd2(unsigned int a, unsigned int b);
__device__ unsigned int __vadd4(unsigned int a, unsigned int b);
__device__ unsigned int __vaddss2(unsigned int a, unsigned int b);
__device__ unsigned int __vaddss4(unsigned int a, unsigned int b);
__device__ unsigned int __vaddus2(unsigned int a, unsigned int b);
__device__ unsigned int __vaddus4(unsigned int a, unsigned int b);
__device__ unsigned int __vavgs2(unsigned int a, unsigned int b);
__device__ unsigned int __vavgs4(unsigned int a, unsigned int b);
__device__ unsigned int __vavgu2(unsigned int a, unsigned int b);
__device__ unsigned int __vavgu4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpeq2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpeq4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpges2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpges4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgeu2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgeu4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgts2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgts4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgtu2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpgtu4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmples2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmples4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpleu2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpleu4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmplts2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmplts4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpltu2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpltu4(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpne2(unsigned int a, unsigned int b);
__device__ unsigned int __vcmpne4(unsigned int a, unsigned int b);
__device__ unsigned int __vhaddu2(unsigned int a, unsigned int b);
__device__ unsigned int __vhaddu4(unsigned int a, unsigned int b);
unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
int __viaddmax_s32(const int a, const int b, const int c);
__device__ int __viaddmax_s32(const int a, const int b, const int c);
int __viaddmax_s32_relu(const int a, const int b, const int c);
__device__ int __viaddmax_s32_relu(const int a, const int b, const int c);
unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
int __viaddmin_s32(const int a, const int b, const int c);
__device__ int __viaddmin_s32(const int a, const int b, const int c);
int __viaddmin_s32_relu(const int a, const int b, const int c);
__device__ int __viaddmin_s32_relu(const int a, const int b, const int c);
unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
__device__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
int __vibmax_s32(const int a, const int b, bool *const pred);
__device__ int __vibmax_s32(const int a, const int b, bool *const pred);
unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
__device__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool *const pred);
__device__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool *const pred);
unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
__device__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
int __vibmin_s32(const int a, const int b, bool *const pred);
__device__ int __vibmin_s32(const int a, const int b, bool *const pred);
unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
__device__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool *const pred_hi, bool *const pred_lo);
unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool *const pred);
__device__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool *const pred);
unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
int __vimax3_s32(const int a, const int b, const int c);
__device__ int __vimax3_s32(const int a, const int b, const int c);
int __vimax3_s32_relu(const int a, const int b, const int c);
__device__ int __vimax3_s32_relu(const int a, const int b, const int c);
unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b);
__device__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b);
int __vimax_s32_relu(const int a, const int b);
__device__ int __vimax_s32_relu(const int a, const int b);
unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
int __vimin3_s32(const int a, const int b, const int c);
__device__ int __vimin3_s32(const int a, const int b, const int c);
int __vimin3_s32_relu(const int a, const int b, const int c);
__device__ int __vimin3_s32_relu(const int a, const int b, const int c);
unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
__device__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b);
__device__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b);
int __vimin_s32_relu(const int a, const int b);
__device__ int __vimin_s32_relu(const int a, const int b);
__device__ unsigned int __vmaxs2(unsigned int a, unsigned int b);
__device__ unsigned int __vmaxs4(unsigned int a, unsigned int b);
__device__ unsigned int __vmaxu2(unsigned int a, unsigned int b);
__device__ unsigned int __vmaxu4(unsigned int a, unsigned int b);
__device__ unsigned int __vmins2(unsigned int a, unsigned int b);
__device__ unsigned int __vmins4(unsigned int a, unsigned int b);
__device__ unsigned int __vminu2(unsigned int a, unsigned int b);
__device__ unsigned int __vminu4(unsigned int a, unsigned int b);
__device__ unsigned int __vneg2(unsigned int a);
__device__ unsigned int __vneg4(unsigned int a);
-__device__ unsigned int __vnegss2(unsigned int a);
-__device__ unsigned int __vnegss4(unsigned int a);
-__device__ unsigned int __vsads2(unsigned int a, unsigned int b);
-__device__ unsigned int __vsads4(unsigned int a, unsigned int b);
__device__ unsigned int __vsadu2(unsigned int a, unsigned int b);
__device__ unsigned int __vsadu4(unsigned int a, unsigned int b);
__device__ unsigned int __vseteq2(unsigned int a, unsigned int b);
__device__ unsigned int __vseteq4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetges2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetges4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgeu2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgeu4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgts2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgts4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgtu2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetgtu4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetles2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetles4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetleu2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetleu4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetlts2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetlts4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetltu2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetltu4(unsigned int a, unsigned int b);
__device__ unsigned int __vsetne2(unsigned int a, unsigned int b);
__device__ unsigned int __vsetne4(unsigned int a, unsigned int b);
__device__ unsigned int __vsub2(unsigned int a, unsigned int b);
__device__ unsigned int __vsub4(unsigned int a, unsigned int b);
__device__ unsigned int __vsubss2(unsigned int a, unsigned int b);
__device__ unsigned int __vsubss4(unsigned int a, unsigned int b);
__device__ unsigned int __vsubus2(unsigned int a, unsigned int b);
__device__ unsigned int __vsubus4(unsigned int a, unsigned int b);
struct __half
struct __half2
struct __half2_raw
struct __half_raw
struct __nv_bfloat16
struct __nv_bfloat162
struct __nv_bfloat162_raw
struct __nv_bfloat16_raw
-__nv_fp4_e2m1
-__nv_fp4x2_e2m1
-__nv_fp4x4_e2m1
-__nv_fp6_e2m3
-__nv_fp6_e3m2
-__nv_fp6x2_e2m3
-__nv_fp6x2_e3m2
-__nv_fp6x4_e2m3
-__nv_fp6x4_e3m2
-__nv_fp8_e4m3
-__nv_fp8_e5m2
-__nv_fp8_e8m0
-__nv_fp8x2_e4m3
-__nv_fp8x2_e5m2
-__nv_fp8x2_e8m0
-__nv_fp8x4_e4m3
-__nv_fp8x4_e5m2
-__nv_fp8x4_e8m0