// 该代码实现了矩阵相加计算,在GPU中将矩阵的数据分块平铺处理,使用cudaMallocPitch/cudaMemcpy2D优化矩阵访问,并且对比了算法相对于CPU版本的加速效果,验证了算法正确性。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <malloc.h>
#include <random>
#include "time.h"
// Matrix dimensions: every matrix below is declared as [H][W], i.e. H rows of W ints.
#define W 2000// matrix width (elements per row)
#define H 3000
// Host-side input operands; main() fills them with pseudo-random values.
int a[H][W];
int b[H][W];
int c1[H][W];// holds the result computed on the CPU
int c2[H][W];// holds the result computed on the GPU
/**
 * GPU kernel: element-wise sum of two H x W int matrices, c = a + b.
 *
 * Each thread handles one element, addressed by its 2D global thread
 * coordinates (x selects the column, y selects the row).
 *
 * @param c      output matrix
 * @param a      first input matrix
 * @param b      second input matrix
 * @param pitch  row stride used for all three matrices, expressed in int
 *               elements (not bytes), since it is added directly to int
 *               pointer indices
 */
__global__ void matrixAddGPU(int* c, int* a, int* b, size_t pitch)
{
    // Global 2D coordinates of this thread within the launch grid.
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall outside the matrix do nothing.
    if (col >= W || row >= H) {
        return;
    }

    // Same strided offset applies to all three matrices.
    const size_t offset = row * pitch + col;
    c[offset] = a[offset] + b[offset];
}
/**
 * CPU reference implementation: element-wise sum of two H x W int matrices.
 *
 * @param c  output matrix, receives a + b
 * @param a  first input matrix
 * @param b  second input matrix
 */
void matrixAddCPU(int c[][W], int a[][W], int b[][W]) {
    for (int row = 0; row < H; ++row) {
        // Work on one row at a time through plain row pointers.
        int* out = c[row];
        int* lhs = a[row];
        int* rhs = b[row];
        for (int col = 0; col < W; ++col) {
            out[col] = lhs[col] + rhs[col];
        }
    }
}
// Reports a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Fills the two input matrices with pseudo-random values, adds them on the
 * CPU and on the GPU, prints the time taken by each, and checks that both
 * results are identical.
 *
 * @return 0 when the run completed (whether or not the results matched),
 *         1 when any CUDA runtime call or the kernel launch failed.
 */
int main()
{
    // Fixed seed so the inputs are the same on every run.
    srand(0);
    for (int i = 0; i < H; ++i) {
        for (int j = 0; j < W; ++j) {
            a[i][j] = rand() % 1000;
            b[i][j] = rand() % 1000;
        }
    }

    // ---- CPU reference ----
    clock_t start, end;
    double elapsedTime;
    start = clock();
    matrixAddCPU(c1, a, b);
    end = clock();
    // clock() counts ticks, CLOCKS_PER_SEC of them per second; convert to
    // milliseconds so the value matches the "ms" label and the GPU timing.
    elapsedTime = (double)(end - start) * 1000.0 / CLOCKS_PER_SEC;
    printf("time to generate CPU:% 5.3f ms\n", elapsedTime);// CPU elapsed time

    // ---- GPU version ----
    int* dev_a = NULL;
    int* dev_b = NULL;
    int* dev_c = NULL;
    cudaEvent_t start1 = NULL;
    cudaEvent_t stop1 = NULL;
    const size_t rowBytes = sizeof(int) * W;// bytes of real data per row
    bool gpuOk = false;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        // Each allocation reports its own pitch (row stride in bytes).
        size_t pitchA = 0, pitchB = 0, pitchC = 0;

        if (!checkCuda(cudaMallocPitch((void**)&dev_a, &pitchA, rowBytes, H),
                       "cudaMallocPitch(dev_a)")) break;
        if (!checkCuda(cudaMemcpy2D(dev_a, pitchA, a, rowBytes, rowBytes, H,
                                    cudaMemcpyHostToDevice),
                       "cudaMemcpy2D(dev_a)")) break;

        if (!checkCuda(cudaMallocPitch((void**)&dev_b, &pitchB, rowBytes, H),
                       "cudaMallocPitch(dev_b)")) break;
        if (!checkCuda(cudaMemcpy2D(dev_b, pitchB, b, rowBytes, rowBytes, H,
                                    cudaMemcpyHostToDevice),
                       "cudaMemcpy2D(dev_b)")) break;

        if (!checkCuda(cudaMallocPitch((void**)&dev_c, &pitchC, rowBytes, H),
                       "cudaMallocPitch(dev_c)")) break;

        // The kernel takes one row stride for all three matrices, so the
        // three pitches must agree; refuse to run rather than index wrongly.
        if (pitchA != pitchB || pitchA != pitchC) {
            fprintf(stderr, "cudaMallocPitch returned differing pitches: %lu %lu %lu\n",
                    (unsigned long)pitchA, (unsigned long)pitchB, (unsigned long)pitchC);
            break;
        }

        dim3 dimBlock(16, 16);// thread-block (tile) size
        // Ceil-division so partial tiles at the right/bottom edges are covered.
        dim3 dimGrid((W + dimBlock.x - 1) / dimBlock.x,
                     (H + dimBlock.y - 1) / dimBlock.y);

        if (!checkCuda(cudaEventCreate(&start1), "cudaEventCreate(start1)")) break;
        if (!checkCuda(cudaEventCreate(&stop1), "cudaEventCreate(stop1)")) break;

        if (!checkCuda(cudaEventRecord(start1, 0), "cudaEventRecord(start1)")) break;
        // The kernel indexes int pointers, so pass the pitch in elements.
        matrixAddGPU<<<dimGrid, dimBlock>>>(dev_c, dev_a, dev_b, pitchC / sizeof(int));
        // A launch does not return a status; fetch launch errors explicitly.
        if (!checkCuda(cudaGetLastError(), "matrixAddGPU launch")) break;
        if (!checkCuda(cudaEventRecord(stop1, 0), "cudaEventRecord(stop1)")) break;
        // Waits for the kernel to finish; execution errors surface here.
        if (!checkCuda(cudaEventSynchronize(stop1), "cudaEventSynchronize(stop1)")) break;

        float elapsedTime1;
        if (!checkCuda(cudaEventElapsedTime(&elapsedTime1, start1, stop1),
                       "cudaEventElapsedTime")) break;
        printf("time to generate GPU:% 5.3f ms\n", elapsedTime1);// GPU kernel elapsed time

        // Copy the GPU result back into the densely packed host matrix.
        if (!checkCuda(cudaMemcpy2D(c2, rowBytes, dev_c, pitchC, rowBytes, H,
                                    cudaMemcpyDeviceToHost),
                       "cudaMemcpy2D(c2)")) break;

        gpuOk = true;
    } while (0);

    // Release device resources on both the success and the failure path.
    if (start1 != NULL) cudaEventDestroy(start1);
    if (stop1 != NULL) cudaEventDestroy(stop1);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (!gpuOk) {
        return 1;
    }

    // ---- Verify the GPU result against the CPU reference ----
    bool flag = true;
    for (int i = 0; i < H && flag; ++i) {// stop scanning rows after the first mismatch
        for (int j = 0; j < W; ++j) {
            if (c1[i][j] != c2[i][j]) {
                flag = false;
                break;
            }
        }
    }
    if (flag) printf("Consistent!!!\n");
    else printf("Not consistent!!!\n");
    return 0;
}