0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc

介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。

▶ 源代码:静态

 // simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} __global__ void vote3(bool *info, int warp_size)// 跨线程束检查
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );// 将每个线程指向等距间隔的元素,表明表决函数的运算结果可以进行分发 // 第一组 “下标模 3 得 0” 的元素为 0,第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素
*offs = __any_sync(mask, tx >= warp_size * / );
// 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0,第二组后半段和第三组 “下标模 3 得 1” 的元素为 1
*(offs + ) = (tx >= warp_size * / )? true: false;
// 第一组和第二组 “下标模 3 得 2” 的元素为 0,第三组 “下标模 3 得 2” 的元素为 1
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
// 最终结果应该是:
// 1 2 3 4 15 16 17 18 30 31 32
// 000 000 000 000 ... 000 000 000 000 ... 000 000 000
// 100 100 100 100 ... 100 100 110 110 ... 110 110 110
// 111 111 111 111 ... 111 111 111 111 ... 111 111 111
}
#endif
 // simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh" #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组,size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
}
// 数组检查函数,type == 1:把数组元素全部加起来,结果非零就报错;type == 0:把数组元素全部加起来,结果不等于 WARP_SIZE 就报错
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end-);
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type?(sum > ):(sum != WARP_SIZE);
} // 数组检查的中间函数,type == 1:使用(1,0,0,0)的模式调用数组检查函数;type == 0:使用(1,1,1,0)的模式调用数组检查函数
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / )) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result;
bool *dinfo = NULL, *hinfo = NULL;
int error_count[] = { , , };
cudaSetDevice(); //使用长度为 4 个线程束的数组,刚好分为 4 个组(全零,后交替非零,前交替非零,全非零)进行表决
totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice); //测试一,any
printf("\n\tTest 1: ");
voteAny << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
voteAll << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三,使用长度为 9 个线程束的数组,但调用内核时只使用数量为 3 个线程束的线程,即分为 3 组,每组 WARP_SIZE * 3 个元素
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
vote3 << <, totalElement / >> > (dinfo, WARP_SIZE);
cudaDeviceSynchronize();
cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cudaFree(d_input);
cudaFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cudaFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}

▶ 输出结果:

    Start.

    Test : Passed

    Test : Passed

    Test : Passed

    Finish.

▶ 源代码:运行时编译(删掉了相同的注释)

 // simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} extern "C" __global__ void vote3(bool *info, int warp_size)
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );
*offs = __any_sync(mask, tx >= warp_size * / );
*(offs + ) = (tx >= warp_size * / ) ? true : false;
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
}
#endif
 // simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
} int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end - );
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type ? (sum > ) : (sum != WARP_SIZE);
} int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
CUdeviceptr d_input, d_result;// unsigned long long
bool *hinfo = NULL;
CUdeviceptr dinfo;
int error_count[] = { , , };
//cudaSetDevice(0); // 编译 PTX
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );// (1, NULL) 为主函数接受的参数个数和参数
CUmodule module = loadPTX(ptx, , NULL); totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int)); //测试一,any
printf("\n\tTest 1: ");
dim3 gridBlock(, );
dim3 threadBlock(totalElement, );
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "voteAny");
void *arr1[] = { (void *)&d_input, (void *)&d_result };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
cuModuleGetFunction(&kernel_addr, module, "voteAll");
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cuMemAlloc(&dinfo, totalElement * sizeof(bool));
cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
threadBlock = dim3(totalElement / , ); // 改变线程块尺寸
cuModuleGetFunction(&kernel_addr, module, "vote3");
int size = WARP_SIZE;
void *arr2[] = { (void *)&dinfo, (void *)&size };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr2[], );
cuCtxSynchronize();
cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cuMemFree(d_input);
cuMemFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cuMemFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}

▶ 输出结果:

        Start.
> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability Test : Passed Test : Passed Test : Passed Finish.

▶ 涨姿势

● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。

上一篇:[20191011]通过bash计算sql语句的sql_id.txt


下一篇:linux进程模型总结