原子操作。并且在静态代码和运行时编译两种条件下使用。
▶ 源代码:静态使用
#ifndef _SIMPLEATOMICS_KERNEL_H_
#define _SIMPLEATOMICS_KERNEL_H_
//#include "device_launch_parameters.h" __global__ void testKernel(int *g_odata)
{
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; // 算术运算原子指令
atomicAdd(&g_odata[], ); // 0号位加 10 atomicSub(&g_odata[], ); // 1号位减 10 atomicExch(&g_odata[], tid); // 2号位与 tid 号值交换(获得最后一个访问的 tid 号) atomicMax(&g_odata[], tid); // 3号位获得最大的 tid 号 atomicMin(&g_odata[], tid); // 4号位获得最小的 tid 号 atomicInc((unsigned int *)&g_odata[], ); // 5号位做模 17 的加法(g_odata[5] == 15 时加 1 得 16,再加 1 得 0) atomicDec((unsigned int *)&g_odata[], );// 6号位做模 137 的减法(g_odata[5] == 0 时减 1 得 136,再减 1 得 135) atomicCAS(&g_odata[], tid - , tid); // 7号位迭代 (g_odata[7] == tid - 1) ? tid : (g_odata[7]);
// 即以 g_odata[7] 初值为起点,增量为 1 的子序列的最大长度(一旦有增量为 1 的元素插到前面去该值就再也不变)
// 位运算原子指令
atomicAnd(&g_odata[], *tid+); // 8号位为 1,注意 (2k+7)%2 == 1 但 (2k+7)%(2^m) == 0 或 1,即仅最后一位能保证为 1 atomicOr(&g_odata[], << tid); // 9号位为 -1,所有的位均为 1 atomicXor(&g_odata[], tid); // 10号位为 255,注意异或运算具有交换律和结合律,硬算
} #endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
/*simpleAtomicIntrinsics_cpu.cpp*/
#include <stdio.h> extern "C" int computeGold(int *gpuData, const int len); int computeGold(int *gpuData, const int len)
{
if (gpuData[] != * len)
{
printf("atomicAdd failed\n");
return false;
}
if (gpuData[] != - * len)
{
printf("atomicSub failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[2] ∈ [0, len)
{
printf("atomicExch failed\n");
return false;
}
if (gpuData[] != len - )
{
printf("atomicMax failed\n");
return false;
}
if (gpuData[]!=)
{
printf("atomicMin failed\n");
return false;
}
if (gpuData[] != len % )
{
printf("atomicInc failed\n");
return false;
}
if (gpuData[] != - len % )
{
printf("atomicDec failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[7] ∈ [0, len)
{
printf("atomicCAS failed\n");
return false;
}
if (gpuData[] != )
{
printf("atomicAnd failed\n");
return false;
}
if (gpuData[] != -)
{
printf("atomicOr failed\n");
return false;
}
if (gpuData[] != 0xff)
{
printf("atomicXor failed\n");
return false;
}
return true;
}
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleAtomicIntrinsics_kernel.cuh" #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++); // 初始化为全零
h_data[] = h_data[] = 0xff; // 搞点非零值 int *d_data;
cudaMalloc((void **) &d_data, memSize);
cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice); // 输出运算前的结果
printf("\n\tBefore:");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); // 计算和计时
StopWatchInterface *timer;
sdkCreateTimer(&timer);
sdkStartTimer(&timer); testKernel << <numBlocks, numThreads >> > (d_data);
getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer);
printf("\nProcessing time: %f ms\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost); // 输出运算后的结果
printf("\n\tAfter :");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cudaFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}
▶ 源代码:即时编译
/*simpleAtomicIntrinsics_kernel.cuh 发生变化的地方*/
extern "C" __global__ void testKernel(int *g_odata)
/*simpleAtomicIntrinsics_cpu.cpp 完全一样*/
/*simpleAtomicIntrinsics.cpp*/
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <nvrtc_helper.h>
#include <helper_functions.h>// includes cuda.h and cuda_runtime_api.h #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; //即时编译过程
char *kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", NULL);
char *ptx;
size_t ptxSize;
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize);
CUmodule module = loadPTX(ptx, , NULL);
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "testKernel"); int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++);
h_data[] = h_data[] = 0xff; CUdeviceptr d_data;
cuMemAlloc(&d_data, memSize);
cuMemcpyHtoD(d_data, h_data, memSize); dim3 cudaBlockSize(numThreads,,);
dim3 cudaGridSize(numBlocks, , );
void *arr[] = { (void *)&d_data };
cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, , , &arr[], ); cuCtxSynchronize(); cuMemcpyDtoH(h_data, d_data, memSize); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cuMemFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}
▶ 输出结果:
Started! Before: , , , , , , , , , , , Processing time: 0.035352 ms After : , -, , , , , , , , -, , Completed! main function returned OK!
▶ 涨姿势
● 一个有趣的数列:命 x0 = 0,xn = xn-1 XOR n,则有 x4n == 4n,x4n+1 = 1, x4n+2 == 4n+3, x4n+3 == 0。当改变初值的时候该表达式发生变化,结果如下图。三种颜色分别代表初始选作右边三个值的时候的结果。
● 解毒 device_atomic_functions.h 与原子操作。只保留了有效部分,去掉了注释和留白。
#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
#define __DEVICE_ATOMIC_FUNCTIONS_HPP__ #if defined(__CUDACC_RTC__) // 主机编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __host__ __device__
#else // 设备编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
#endif #if defined(__cplusplus) && defined(__CUDACC__) #include "builtin_types.h"
#include "host_defines.h" // 整数原子加法。返回 *address 旧值,*address += val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
{
return __iAtomicAdd(address, val);
} // 无符号整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, val);
} // 整数原子减法,注意转换为加法来运算。返回 *address 旧值,*address -= val;。
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
{
return __iAtomicAdd(address, (unsigned int)-(int)val);
} // 无符号整数原子减法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, (unsigned int)-(int)val);
} // 整数原子替换。返回 *address 旧值,*address = val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
{
return __iAtomicExch(address, val);
} // 无符号整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
{
return __uAtomicExch(address, val);
} // 浮点原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
{
return __fAtomicExch(address, val);
} // 整数原子取小。返回 *address 旧值,*address = MIN(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
{
return __iAtomicMin(address, val);
} // 无符号整数原子取小
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
{
return __uAtomicMin(address, val);
} // 整数原子取大。返回 *address 旧值,*address = MAX(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
{
return __iAtomicMax(address, val);
} // 无符号整数原子取大
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
{
return __uAtomicMax(address, val);
} // 无符号整数原子模加法。返回 *address 旧值,*address = (*adress + 1) % (val + 1);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
{
return __uAtomicInc(address, val);
} // 无符号整数原子模减法。返回 *address 旧值,*address = (*adress + val) % (val + 1);
// 不用 (*adress - 1) 是为了把结果控制在 [0, val] 中,防止变成负数,这与 C 中的 % 运算不同
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
{
return __uAtomicDec(address, val);
} // 整数原子按位且。返回 *address 旧值,*adress &= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
{
return __iAtomicAnd(address, val);
} // 无符号整数原子按位且
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
{
return __uAtomicAnd(address, val);
} // 整数原子按位或。返回 *address 旧值,*adress |= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
{
return __iAtomicOr(address, val);
} // 无符号整数原子按位或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
{
return __uAtomicOr(address, val);
} // 整数原子按位异或。返回 *address 旧值,*adress ^= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
{
return __iAtomicXor(address, val);
} // 无符号整数原子按位异或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
{
return __uAtomicXor(address, val);
} // 整数原子比较赋值。返回 *address 旧值,*address = (*address == compare) ? val : *address;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
{
return __iAtomicCAS(address, compare, val);
} // 无符号整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{
return __uAtomicCAS(address, compare, val);
} // 无符号长整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicAdd(address, val);
} // 无符号长整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicExch(address, val);
} // 无符号长整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
{
return __ullAtomicCAS(address, compare, val);
} // 原子存在量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
{
return (bool)__any((int)cond);
} // 原子全称量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
{
return (bool)__all((int)cond);
} #endif /* __cplusplus && __CUDACC__ */ #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
● 原子操作函数声明在 device_functions.h 中。
当设备计算能力 > 320 或 > 600 时开放各原子操作对应的 block 和 system 函数,并开放对应 long long、float、double 型数据的同一个函数,例如:
#if !defined(__CUDACC_RTC__) || __CUDA_ARCH__ >= 600
__DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);
#endif /* !__CUDACC_RTC__ || __CUDA_ARCH__ >= 600 */
计算能力 600 以上能用的所有原子操作:
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __host__ __device__ __cudart_builtin__ __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_block(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_system(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS(unsigned int *p, unsigned int compare, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_block(unsigned int *p, unsigned int compare, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_system(unsigned int *p, unsigned int compare unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_block(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_system(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicAnd(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicAnd(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicOr(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicOr(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicXor(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicXor(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_system(unsigned long long *p, unsigned long long val);