Nsight Compute 软件Source模块提供了精确到源代码行号的metrics参数,用于辅助性能调优,本篇基于访问共享内存的矩阵转置核函数的实现,记录一下对常用metrics含义的理解。
Metrics含义
Memory L1 Transactions Global:实际全局内存加载至L1缓存的内存交换次数,粒度128 bytes
Memory L2 Transactions Global:实际全局内存加载至L2缓存的内存交换次数,粒度32 bytes;由于L1的粒度(128 bytes)是L2粒度(32 bytes)的4倍,该参数的值应该是Memory L1 Transactions Global的4倍
Memory Ideal L2 Transactions Global:理论需要从全局内存加载至L2缓存的内存交换次数,当数值比Memory L2 Transactions Global小时,说明当前全局内存访问模式有效率问题
Memory L1 Transactions Shared:L1缓存与共享内存的数据交换次数,粒度32bytes
Memory Ideal L1 Transactions Shared:理论需要的L1缓存与共享内存的数据交换次数,当数值比Memory L1 Transactions Shared小时,说明存在Bank Conflict
代码实现
核函数执行配置
// Launch geometry: 32 x 32 = 1024 threads per block, on a 32 x 32 grid of blocks.
// The z extent is left at dim3's default of 1.
dim3 block(32, 32);
dim3 grid(32, 32);
无Bank Conflict的核函数实现
// Transposes a row-major float matrix through a shared-memory tile.
//
// d_src is read with a row stride of COLDIM and d_dst is written with a row
// stride of ROWDIM, i.e. a ROWDIM x COLDIM input becomes a COLDIM x ROWDIM
// output. Each thread moves exactly one element.
//
// Launch requirements:
//   - 2D thread blocks; the tile is written as [threadIdx.y][threadIdx.x] and
//     read back as [threadIdx.x][threadIdx.y], so the block must be square
//     (blockDim.x == blockDim.y) and no larger than SMDIM per side.
//     NOTE(review): SMDIM is defined outside this snippet - presumably 32 to
//     match a 32 x 32 block; confirm.
//   - The grid must cover the whole matrix; threads that fall outside the
//     matrix are masked off by the bounds guards below.
//   - Static shared memory: SMDIM * (SMDIM + 1) floats per block.
__global__ void kSMMatrixT(float* d_src, float* d_dst, int ROWDIM, int COLDIM)
{
    // One extra padding column per row: the transposed read below walks down a
    // column of the tile, and the padding shifts consecutive rows onto
    // different shared-memory banks, avoiding bank conflicts.
    __shared__ float smTmp[SMDIM][SMDIM + 1];

    // Source coordinates: x runs along a row (column index), y selects the row.
    int srcxIdx = threadIdx.x + blockIdx.x * blockDim.x;
    int srcyIdx = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the load so partial tiles at the matrix edge do not read out of
    // bounds. The offset is widened to size_t to avoid 32-bit overflow.
    if (srcxIdx < COLDIM && srcyIdx < ROWDIM) {
        smTmp[threadIdx.y][threadIdx.x] = d_src[(size_t)srcyIdx * COLDIM + srcxIdx];
    }

    // Barrier is outside any branch so that every thread of the block reaches it.
    __syncthreads();

    // Destination coordinates: block indices are swapped, thread indices are
    // not, so consecutive threadIdx.x still write consecutive addresses.
    int dstxIdx = blockIdx.y * blockDim.y + threadIdx.x;
    int dstyIdx = blockIdx.x * blockDim.x + threadIdx.y;

    // This guard selects exactly the tile cells that were filled above, so no
    // uninitialized shared memory is read and no out-of-bounds store happens.
    if (dstxIdx < ROWDIM && dstyIdx < COLDIM) {
        d_dst[(size_t)dstyIdx * ROWDIM + dstxIdx] = smTmp[threadIdx.x][threadIdx.y];
    }
}
存在Bank Conflict的核函数实现
// Transposes a row-major float matrix through an UNPADDED shared-memory tile.
// This variant intentionally keeps the shared-memory bank conflicts so that
// they can be observed in a profiler; do not add padding here.
//
// d_src is read with a row stride of COLDIM and d_dst is written with a row
// stride of ROWDIM, i.e. a ROWDIM x COLDIM input becomes a COLDIM x ROWDIM
// output. Each thread moves exactly one element.
//
// Launch requirements:
//   - 2D thread blocks; the tile is written as [threadIdx.y][threadIdx.x] and
//     read back as [threadIdx.x][threadIdx.y], so the block must be square
//     (blockDim.x == blockDim.y) and no larger than SMDIM per side.
//     NOTE(review): SMDIM is defined outside this snippet - presumably 32 to
//     match a 32 x 32 block; confirm.
//   - The grid must cover the whole matrix; threads that fall outside the
//     matrix are masked off by the bounds guards below.
//   - Static shared memory: SMDIM * SMDIM floats per block.
__global__ void kBankCMatrixT(float* d_src, float* d_dst, int ROWDIM, int COLDIM)
{
    // No padding column: rows are exactly SMDIM floats apart, so the
    // column-wise read below strides by SMDIM floats between neighbouring
    // threadIdx.x values. With SMDIM a multiple of 32 those accesses land in
    // the same bank and are serialized.
    __shared__ float smTmp[SMDIM][SMDIM];

    // Source coordinates: x runs along a row (column index), y selects the row.
    int srcxIdx = threadIdx.x + blockIdx.x * blockDim.x;
    int srcyIdx = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the load so partial tiles at the matrix edge do not read out of
    // bounds. The offset is widened to size_t to avoid 32-bit overflow.
    if (srcxIdx < COLDIM && srcyIdx < ROWDIM) {
        smTmp[threadIdx.y][threadIdx.x] = d_src[(size_t)srcyIdx * COLDIM + srcxIdx];
    }

    // Barrier is outside any branch so that every thread of the block reaches it.
    __syncthreads();

    // Destination coordinates: block indices are swapped, thread indices are
    // not, so consecutive threadIdx.x still write consecutive addresses.
    int dstxIdx = blockIdx.y * blockDim.y + threadIdx.x;
    int dstyIdx = blockIdx.x * blockDim.x + threadIdx.y;

    // This guard selects exactly the tile cells that were filled above, so no
    // uninitialized shared memory is read and no out-of-bounds store happens.
    if (dstxIdx < ROWDIM && dstyIdx < COLDIM) {
        d_dst[(size_t)dstyIdx * ROWDIM + dstxIdx] = smTmp[threadIdx.x][threadIdx.y];
    }
}
具体Metrics
BankConflict
无BankConflict