Optimization Strategies for CUDA Reduction Operations

Problem Description

Parallelize and optimize the column-sum reduction program. The main requirements:
• Only a single Tesla K40m GPU may be used for the computation.
• Measure the speedup for the typical problem sizes listed in the attached slurm script.
Analysis of the original code:

Running the code: after entering the directory, first load the CUDA module. Note in particular that version 9.0 is required:
module add cuda/9.0
Then compile with make. If you run make first, or compile with the wrong CUDA version loaded, intermediate files are generated and the build fails; reloading CUDA 9.0 and rebuilding will still fail, most likely because those stale intermediate files interfere with compilation. In that case run make clean to delete the previously generated .o files, then compile again. Running the original version prints the reduction times for matrices of several shapes: the CPU time, the GPU time, and the kernel time.

Code Description

The only difference between the later versions is col_reduce.cu; the rest of the code is unchanged. Note that the Makefile targets --gpu-architecture=sm_35, which matches the Tesla K40m (Kepler, compute capability 3.5).
Makefile

CC = gcc
CFLAGS = -std=c99

NVCC = nvcc
NVCC_FLAGS = --gpu-architecture=sm_35 -std=c++11 -O3 -Wno-deprecated-gpu-targets

LIBRARIES = -L/mnt/lustrefs/softwares/cuda/9.0/lib64 -lcudart -lm

col_reduce: main.o cpu_col_reduce.o gpu_col_reduce.o 
	$(CC) $^ -o $@ $(LIBRARIES)

main.o: main.c
	$(CC) $(CFLAGS) -c $^ -o $@

cpu_col_reduce.o: col_reduce.c
	$(CC) $(CFLAGS) -c $^ -o $@

gpu_col_reduce.o: col_reduce.cu
	$(NVCC) $(NVCC_FLAGS) -c $^ -o $@

clean:
	rm -f *.o col_reduce
	

col_reduce.c

// M: [m, n]
// vec: [n]
void cpu_mat_column_reduce(double* mat, double* vec, int m, int n) {
  for (int i = 0; i < n; i++) {  // n col
    double sum = 0.0;
    for (int j = 0; j < m; j++) {  // m row
      sum += mat[j * n + i];
    }
    vec[i] = sum;
  }
}
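
A side note on this baseline: because mat is stored row-major, the inner loop reads elements that are n doubles apart, so each column pass strides through memory. This does not affect correctness, but for reference, here is a sketch of a cache-friendlier row-outer traversal (the variant and its name are my own addition, not part of the original code):

// Sketch: walk mat row by row so reads are sequential in memory,
// accumulating all n column sums at once. Produces the same vec
// as cpu_mat_column_reduce above.
void cpu_mat_column_reduce_rowwise(double* mat, double* vec, int m, int n) {
  for (int i = 0; i < n; i++) vec[i] = 0.0;
  for (int j = 0; j < m; j++) {    // rows outer: sequential access
    for (int i = 0; i < n; i++) {  // columns inner
      vec[i] += mat[j * n + i];
    }
  }
}

With the accumulators kept in vec, every cache line of mat is read exactly once.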


col_reduce.h

#ifndef COL_REDUCE_H
#define COL_REDUCE_H

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

#define SEED 0

void cpu_mat_column_reduce(double* mat, double* vec, int m, int n);
void gpu_mat_column_reduce(double* mat, double* vec, int m, int n);
// Returns a randomized matrix containing mxn elements
static inline double *rand_mat(int m, int n) {
  srand(SEED);
  double *mat = (double *) malloc(m * n * sizeof(double));
  if (mat == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  for(int i = 0; i < m; i++){
    for(int j = 0; j < n; j++){
      mat[i * n + j] = (double)(rand() % 10) / 100000.0;
    }
  }
  return mat;
}

// Returns an uninitialized matrix containing m x n elements
static inline double *raw_mat(int m, int n) {
  double *mat = (double *) malloc(m * n * sizeof(double));
  if (mat == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  return mat;
}

// Returns a zero-initialized vector of m elements
static inline double *zero_vec(int m) {
  double *vec = (double *) malloc(m * sizeof(double));
  if (vec == NULL) {
    printf("Error allocating CPU memory");
    exit(1);
  }
  for (int i = 0; i < m; i++) {
    vec[i] = 0.0;
  }
  return vec;
}

// Returns the current time in microseconds
static inline long long start_timer() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec*1000000 + tv.tv_usec;
}

// Prints the time elapsed since the specified time
static inline long long stop_timer(long long start_time, char *name) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  long long end_time = tv.tv_sec*1000000 + tv.tv_usec;
  printf("%s: %f milisec\n", name, ((double) (end_time - start_time)) / 1000);
  return end_time - start_time;
}

#endif

main.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "col_reduce.h"

int main(int argc, char **argv){

  int m = 3;  // default value
  int n = 3;  // default value
  if (argc == 3) {
    m = atoi(argv[1]); // user-specified value
    n = atoi(argv[2]); // user-specified value
  }
  printf("\nDo column reduce on mat m: %d n: %d ...\n", m, n);
  double* mat = rand_mat(m, n);
  double* cpu_vec = zero_vec(n);
  double* host_vec = zero_vec(n);
  
// CPU
  long long cpu_start_time = start_timer();
  cpu_mat_column_reduce(mat, cpu_vec, m, n);
  long long cpu_time = stop_timer(cpu_start_time, "CPU");

// GPU  
  long long gpu_start_time = start_timer();
  gpu_mat_column_reduce(mat, host_vec, m, n);
  long long gpu_time = stop_timer(gpu_start_time, "GPU");

  // Check the correctness of the GPU results
  int num_wrong = 0;
  for (int i = 0; i < n; i++) {
    if (fabs(cpu_vec[i] - host_vec[i]) > 0.001) {
      num_wrong++;
      printf("cpu vec: %f, host vec: %f \n", cpu_vec[i], host_vec[i]);
    }
  }
  // Report the correctness results
  if (num_wrong) printf("GPU %d / %d values incorrect\n", num_wrong, n);
  else           printf("Passed, GPU all values correct!\n");
  free(mat);
  free(cpu_vec);
  free(host_vec);
}

run.slurm

#!/bin/bash

#SBATCH -o job_%j_%N.out
#SBATCH --partition=gpu
#SBATCH -J col_reduce
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH -t 20:00

module add cuda/9.0
./col_reduce 160000 8
./col_reduce 1600000 8
./col_reduce 6400000 8
./col_reduce 160000 32
./col_reduce 1600000 32
./col_reduce 6400000 32
./col_reduce 160000 64
./col_reduce 1600000 64


col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define ThreadNum 128

// The Tesla K40m is sm_35 (Kepler), which has no native double-precision
// atomicAdd; for architectures below sm_60, supply the standard
// atomicCAS-based fallback.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val==0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n){
    double sum = 0.0;
    for (int j = 0; j < m; j++) {  // m row
      sum +=  mat[j * n + i];
    }
    vec[i] = sum;
  }
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(int(ceil((double)n/ThreadNum)), 1, 1);
  dim3 block_dim(ThreadNum, 1, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}
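
One caveat about this host wrapper (the later versions share it): none of the CUDA API calls or kernel launches are checked for errors, so a failed allocation or launch silently yields wrong or stale results. A minimal error-checking sketch, my own addition rather than part of the original code:

// Hypothetical helper, not in the original code; requires <stdlib.h> for exit().
#define CUDA_CHECK(call)                                          \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
              cudaGetErrorString(err), __FILE__, __LINE__);       \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// Usage (sketch):
//   CUDA_CHECK(cudaMalloc((void**)&d_M, size_M));
//   gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);
//   CUDA_CHECK(cudaGetLastError());  // surfaces invalid launch configurations

Checking cudaGetLastError() after the launch would, for instance, report an invalid launch configuration instead of letting the check in main.c flag wrong values.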

Optimized Versions

high1
In the original code above, n is at most 64. Since grid_dim.x = ceil(64 / 128) = 1, the grid has a single block in x, and although blockDim.x is 128, at most 64 of those threads ever do work, so many threads sit idle. The original code assigns one thread to sum one entire column; following that line of thought, can several threads cooperate on a single column? The concrete idea is as follows:
Set blockDim.x = 64, so that along x one row of threads covers the 64 columns, and use blockDim.y to split the work within each column. Because a block is limited to 1024 threads on this hardware, set blockDim.y = 16 (64 × 16 = 1024): each column is cut into 16 segments, one thread sums one segment, and the 16 partial results are then added together. Note: with this scheme each thread holds the sum of only one segment of its column, and all 16 of them must accumulate into the same vec[i]. That is a read-modify-write conflict, so the final accumulation must use an atomic operation (atomicAdd) to avoid the race.
col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define ThreadNum 64
#define colnum 16

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val==0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int i = blockIdx.x * blockDim.x + threadIdx.x;    // column index
  int wid = (m + colnum - 1) / colnum;              // rows per segment, rounded up so no rows are dropped
  int ind = blockIdx.y * blockDim.y + threadIdx.y;  // segment index within the column
  int st = ind * wid, end = min(ind * wid + wid, m);
  if(i < n){
    double sum = 0.0;
    for (int j = st; j < end; j++) {  // sum this thread's segment of rows
      sum += mat[j * n + i];
    }
    atomicAdd(&vec[i], sum);          // combine the 16 partial sums of column i
  }
  }
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(int(ceil((double)n/ThreadNum)), 1, 1);
  dim3 block_dim(ThreadNum, colnum, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}

high2
This version fixes the launch shape with macros: gridnum = 8 blocks, each holding blocknum × colnum = 8 × 128 = 1024 threads. Each block handles fen = n / blockDim.x columns (assuming n is a multiple of blockDim.x = 8, which holds for the tested sizes 8, 32 and 64), so V = blockDim.x / fen threads along x cooperate on each column. Together with the H = blockDim.y = 128 threads along y, every column is split into H × V segments; one thread sums one segment and, exactly as in high1, the partial sums are combined with atomicAdd. For example, with n = 32 we get fen = 4 and V = 2, so 2 × 128 = 256 threads share each column; with n = 8, fen = 1 and all 1024 threads of a block work on one column.
col_reduce.cu

#include <cuda.h>
#include <stdio.h>
#include <math.h>

#define gridnum 8
#define blocknum 8
#define colnum 128

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val==0.0)
    return __longlong_as_double(old);
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

extern "C" void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n);

__global__
void gpu_mat_col_reduce_kernel(double* mat, double* vec, int m, int n){
  int fen = n / blockDim.x;                  // columns handled per block (assumes n is a multiple of blockDim.x)
  int H = blockDim.y, V = blockDim.x / fen;  // V threads along x share one column
  int i = blockIdx.x * fen + threadIdx.x / V;      // column index
  int wid = (int)(ceil((double)m / (H * V)));      // rows per segment
  int thread = threadIdx.x % V + threadIdx.y * V;  // segment index within the column
  int jst = thread * wid, jend = min((thread + 1) * wid, m);

  if(i < n){
    double sum = 0.0;
    for (int j = jst; j < jend; j++) {  // sum this thread's segment of rows
      sum += mat[j * n + i];
    }
    atomicAdd(&vec[i], sum);            // combine partial sums of column i
  }
  
}

void gpu_mat_column_reduce(double* h_M, double* h_vec, int m, int n) {
  double *d_M, *d_vec;
  size_t size_of_double = sizeof(double);
  size_t size_M = m * n * size_of_double;
  size_t size_vec = n * size_of_double;

// malloc and memcpy
  cudaMalloc((void**)&d_M, size_M);
  cudaMalloc((void**)&d_vec, size_vec);
  cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec, h_vec, size_vec, cudaMemcpyHostToDevice);

// timing   
  cudaEvent_t start, stop;
  float elapsed_time = 0.0;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

// running
  dim3 grid_dim(gridnum , 1, 1);
  dim3 block_dim(blocknum, colnum, 1);
  gpu_mat_col_reduce_kernel<<<grid_dim, block_dim>>>(d_M, d_vec, m, n);

// timing  
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsed_time, start, stop);
// memcpy   
  cudaMemcpy(h_vec, d_vec, size_vec, cudaMemcpyDeviceToHost);

// print res  
  printf("  grid  dim:  %d, %d, %d.\n", grid_dim.x, grid_dim.y, grid_dim.z);
  printf("  block dim: %d, %d, %d.\n", block_dim.x, block_dim.y, block_dim.z);
  printf("  kernel time: %f milisec\n", elapsed_time);
  
 // free 
  cudaFree(d_M);
  cudaFree(d_vec);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}

Above I have given two optimization approaches. I originally intended to write a shared-memory version as well, but for this kind of reduction I could not see how to apply shared memory effectively. For high2, one could additionally declare some of the constants mentioned above as register variables for extra speed, but in my tests the effect was not noticeable. Another idea is to request more blocks so that each block handles exactly one column. Such code is very easy to write, but it raises a question: n is at most 64, so at least 64 blocks are needed, and how many threads should each block get? I tried 1024 threads per block and, for reasons I could not determine, the results came out wrong; my guess was that there is an upper limit on the number of threads the GPU can grant. Lowering the per-block thread count gave correct results, but the measured speedup was poor, so I did not list those numbers. The table \ref{out} shows the speedup of the two optimized versions, where ratio1 and ratio2 denote the speedup of high1 and high2, respectively, over the original version.

Our tests show that, combined sensibly, different thread organizations can effectively accelerate the algorithm. They also show that the number of threads one can request at a time is limited; in particular, there is an upper bound on the threads per block. Not knowing this at first, I requested an enormous number of threads per block, and the kernel never actually ran on the GPU.

Finally, the experiments make it clear that adjusting the thread schedule in this way only shrinks the kernel time; its effect on the overall GPU time is not particularly pronounced. Improving that would require reworking the memory access pattern of the variables so that threads reach the data faster, which is considerably harder.
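
For completeness, here is a sketch of the one-block-per-column variant mentioned above. It is my own reconstruction from the description, not the author's code; it assumes the same double-precision atomicAdd fallback defined earlier, and the block size of 256 stands in for the unspecified "lower" thread count:

__global__ void col_reduce_block_per_col(double* mat, double* vec, int m, int n) {
  int i = blockIdx.x;  // one block per column
  if (i >= n) return;
  double sum = 0.0;
  for (int j = threadIdx.x; j < m; j += blockDim.x) {  // strided rows of column i
    sum += mat[j * n + i];
  }
  atomicAdd(&vec[i], sum);  // combine the blockDim.x partial sums
}

// launch (sketch): one block per column
// col_reduce_block_per_col<<<n, 256>>>(d_M, d_vec, m, n);

Note that within a warp these reads are n doubles apart, so they do not coalesce the way the thread-per-column kernels do, which is consistent with the poor speedup reported above.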
