#include <stdio.h>
#define N 10
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// a, b : input arrays of at least N ints; dereferenced on the device.
// c    : output array of at least N ints; dereferenced on the device.
//
// Launch layout: any 1D grid/block configuration. Each thread starts at its
// flat global index and advances by the total number of launched threads, so
// all N elements are covered no matter how many blocks/threads are used.
// With the <<<N, 1>>> launch each thread handles exactly the element
// c[blockIdx.x]. No shared memory is used.
__global__ void add(int* a, int* b, int* c)
{
    // Total number of threads in the (1D) grid.
    const int stride = gridDim.x * blockDim.x;

    // Start at this thread's flat global index and process the data there;
    // the `tid < N` bound also guards launches with more threads than elements.
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += stride)
        c[tid] = a[tid] + b[tid];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two N-element integer vectors on the GPU and prints each sum.
// Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
    int a[N], b[N], c[N];
    // Start as NULL so the cleanup below is safe even if an allocation fails.
    int* dev_a = NULL;
    int* dev_b = NULL;
    int* dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    // Fill the input arrays 'a' and 'b' on the CPU.
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate memory on the GPU, then copy 'a' and 'b' to it.
    // The && chain stops at the first failing call.
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
        && cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
        && cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
        && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a <- a)")
        && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b <- b)");

    if (ok)
    {
        // One block per element, one thread per block.
        add<<<N, 1>>>(dev_a, dev_b, dev_c);
        // A kernel launch returns no status itself; launch errors must be
        // fetched explicitly with cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "add kernel launch");
    }

    // Copy the result array 'c' from the GPU back to the CPU. Errors raised
    // while the kernel was executing are also reported by this call.
    ok = ok && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- dev_c)");

    // Display the results only if every step succeeded; otherwise 'c' is not valid.
    if (ok)
    {
        for (int i = 0; i < N; i++)
        {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Free the memory allocated on the GPU. Every buffer is always released;
    // cudaFree on a null pointer performs no operation.
    ok = cudaOk(cudaFree(dev_a), "cudaFree(dev_a)") && ok;
    ok = cudaOk(cudaFree(dev_b), "cudaFree(dev_b)") && ok;
    ok = cudaOk(cudaFree(dev_c), "cudaFree(dev_c)") && ok;

    return ok ? 0 : 1;
}