三种基于CUDA的归约实现

归约在并行计算中很常见,并且在实现上具有一定的套路。本文分别基于三种机制(Intrinsic,共享内存,atomic),实现三个版本的归约操作,完成一个warp(32)大小的整数数组的归约求和计算。

Intrinsic版本

基于Intrinsic函数 __shfl_down_sync 实现,使一个warp内的线程通过读取相邻线程寄存器中数据,完成归约操作。
实现如下:

__global__ void kIntrinsicWarpReduce(int* d_src, int* d_dst)
{
	int val = d_src[threadIdx.x];
	//val = __reduce_add_sync(0xFFFFFFFF,val);/*CUDA文档中说这个Intrinsic函数需要计算能力8.x,不知道是哪款显卡*/
	for (size_t delta = (blockDim.x >>1); delta > 0; delta = (delta >> 1))
	{
		val += __shfl_down_sync(0xFFFFFFFF, val,delta, 32);
	}
	if (threadIdx.x == 0)
	{
		d_dst[0] = val;
	}
}

其中,关于函数 __shfl_down_sync 的说明如下,其中英文部分截取自《CUDA_C_Best_Practices_Guide》:
cuda可以用于归约操作的两个Intrinsic指令
unsigned __reduce_add_sync(unsigned mask, unsigned value)

The mask indicates the threads participating in the call.
一个bit位表示一个线程lane,整个warp做归约0xFFFFFFFF

T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width=warpSize)

width must have a value which is a power of 2;If width is less than warpSize then each subsection of the warp behaves as a separate entity with a starting logical lane ID of 0. If srcLane is outside the range [0:width-1], the value returned corresponds to the value of var held by the srcLane modulo width.
以width分割warp,每width个线程为一组,srclane超过width后做取模处理

共享内存版本

在共享内存中与一个warp中其他线程交换数据,实现归约计算。
代码实现如下:

__global__ void kSMWarpReduce(int* d_src, int* d_dst)
{
	__shared__ int sm[warpsize];
	sm[threadIdx.x] = d_src[threadIdx.x];
	for (size_t step = (warpsize >> 1); step > 0; step = (step >> 1))
	{
		if (threadIdx.x < step)
		{
			sm[threadIdx.x] += sm[threadIdx.x + step];
		}
	}

	if (threadIdx.x == 0)
	{
		d_dst[0] = sm[0];
	}

}

Atomic版本

基于原子操作实现归约:

__global__ void kAtomicWarpReduce(int* d_src, int* d_dst)
{
	atomicAdd(d_dst, d_src[threadIdx.x]);
}

测试程序


const int warpsize = 32;
void TestReduce()
{
	int h_src[warpsize];
	for (size_t i = 0; i < warpsize; i++)
	{
		h_src[i] = i;
	}
	int h_dst = 0;
	int* d_src, * d_dst;
	cudaMalloc(&d_src, warpsize * sizeof(int));
	cudaMalloc(&d_dst, sizeof(int));
	cudaMemcpy(d_src, h_src, warpsize * sizeof(int), cudaMemcpyHostToDevice);
	kIntrinsicWarpReduce << <1,32 >> > (d_src,d_dst);
	checkCUDARtn(cudaDeviceSynchronize());
	cudaMemcpy(&h_dst, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
	printf("kIntrinsicWarpReduce rtn is %d \n", h_dst);
	cudaMemset(d_dst, 0, sizeof(int));
	kSMWarpReduce << <1, 32 >> > (d_src, d_dst);
	checkCUDARtn(cudaDeviceSynchronize());
	cudaMemcpy(&h_dst, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
	printf("kSMWarpReduce rtn is %d \n", h_dst);
	cudaMemset(d_dst, 0, sizeof(int));
	kAtomicWarpReduce << <1, 32 >> > (d_src, d_dst);
	checkCUDARtn(cudaDeviceSynchronize());
	cudaMemcpy(&h_dst, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
	printf("kAtomicWarpReduce rtn is %d \n", h_dst);
	cudaFree(d_src);
	cudaFree(d_dst);
}
上一篇:warpAffine 仿射变换图像填充算法对比


下一篇:java面试题及答案2021,java最新面试题汇总(三十四)