Created on 2013-8-5
URL : http://blog.sina.com.cn/s/blog_a502f1a30101mi6t.html
@author: zhxfl
转载请说明出处
// Enumerate every CUDA device visible to the runtime and print its general,
// memory and multiprocessor properties.
cudaDeviceProp prop;
int count = 0;

// A failed query is reported and treated as "no devices", so the loop below
// never runs on an indeterminate count.
cudaError_t status = cudaGetDeviceCount( &count );
if (status != cudaSuccess) {
    fprintf( stderr, "cudaGetDeviceCount failed: %s\n",
             cudaGetErrorString( status ) );
    count = 0;
}

for (int i = 0; i < count; i++) {
    // Skip a device whose properties cannot be read instead of printing
    // stale or uninitialized data from `prop`.
    status = cudaGetDeviceProperties( &prop, i );
    if (status != cudaSuccess) {
        fprintf( stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                 i, cudaGetErrorString( status ) );
        continue;
    }

    printf( " --- General Information for device %d ---\n", i );
    printf( "Name: %s\n", prop.name );
    printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
    printf( "Clock rate: %d\n", prop.clockRate );
    printf( "Device copy overlap: " );
    if (prop.deviceOverlap)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n");
    printf( "Kernel execution timeout : " );
    if (prop.kernelExecTimeoutEnabled)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n" );

    // The memory-size fields are size_t, so they are printed with %zu;
    // %ld is wrong on platforms where long is narrower than size_t.
    printf( " --- Memory Information for device %d ---\n", i );
    printf( "Total global mem: %zu\n", prop.totalGlobalMem );
    printf( "Total constant Mem: %zu\n", prop.totalConstMem );
    printf( "Max mem pitch: %zu\n", prop.memPitch );
    printf( "Texture Alignment: %zu\n", prop.textureAlignment );

    // NOTE(review): the two "per mp" labels below print the per-block fields
    // sharedMemPerBlock and regsPerBlock; the label text is kept unchanged.
    printf( " --- MP Information for device %d ---\n", i );
    printf( "Multiprocessor count: %d\n",
            prop.multiProcessorCount );
    printf( "Shared mem per mp: %zu\n", prop.sharedMemPerBlock );
    printf( "Registers per mp: %d\n", prop.regsPerBlock );
    printf( "Threads in warp: %d\n", prop.warpSize );
    printf( "Max threads per block: %d\n",
            prop.maxThreadsPerBlock );
    printf( "Max thread dimensions: (%d, %d, %d)\n",
            prop.maxThreadsDim[0], prop.maxThreadsDim[1],
            prop.maxThreadsDim[2] );
    printf( "Max grid dimensions: (%d, %d, %d)\n",
            prop.maxGridSize[0], prop.maxGridSize[1],
            prop.maxGridSize[2] );
    printf( "\n" );
}

// Row count of matrix A; `base` is defined outside this excerpt.
int n1 = rand() % base + base;
// Build two random integer matrices A (n1 x m1) and B (n2 x m2) stored as
// flat arrays, then multiply them with matrixMultiply. `base` and `large`
// are defined outside this excerpt.
int m1 = rand() % base + base;   // column count of A
int n2 = m1;                     // rows of B are set equal to columns of A
int m2 = rand() % base + base;   // column count of B
int *g1 = new int[n1 * m1];
int *g2 = new int[n2 * m2];

printf("matrix A[%3d %3d]\n", n1, m1);
for (int i = 0; i < n1 * m1; i++)
{
    g1[i] = rand() % large;
    //printf("%5d ", g1[i]);
    //if((i + 1) % m1 == 0)printf("\n");
}

printf("matrix B[%3d %3d]\n", n2, m2);
for (int i = 0; i < n2 * m2; i++)
{
    g2[i] = rand() % large;
    //printf("%5d ", g2[i]);
    //if((i + 1) % m2 == 0)printf("\n");
}

int *g = matrixMultiply(g1, n1, m1, g2, n2, m2);

printf("matrix C[%3d %3d]\n", n1, m2);
// Debug dump of the result; the prints are intentionally commented out,
// so this loop currently does nothing.
for (int i = 0; i < n1 * m2; i++)
{
    //printf("%5d ", g[i]);
    //if((i + 1) % m2 == 0) printf("\n");
}

// The input matrices were allocated with new[] above and are not used after
// the multiplication, so release them here.
delete[] g1;
delete[] g2;
// NOTE(review): ownership of `g` is decided by matrixMultiply, which is not
// visible here - confirm whether the caller must release it, and how.
cuda本机参数查看的代码
上面是cuda example的代码,其中maxThreadsPerBlock是指每个块上的最大线程数,maxGridSize是最大的blocks数。理论上讲最大的并发量是maxThreadsPerBlock * maxGridSize。下面是在我本机上的运行结果,可以看到65535 * 1024是我想要的答案。
下面来测试一下
1)崩溃:function <<<65535,1024>>>()
2)正常:function <<<65535,1>>>()
3)崩溃:function <<<65536,1>>>()
4)正常:function <<<1,1024>>>()
5)崩溃:function <<<1,1025>>>()
其中第一个的崩溃让人无法理解,其他都符合预期,对于这种申请资源失败的情况,目前还没有较好的对策,如果有我会及时补上