Mixed MPI+CUDA Compilation on Linux
Source files (main.cpp, pi_cu.cu)
main.cpp
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
// Host wrapper (defined in pi_cu.cu) that launches the CUDA kernel
void computePI(int nproc, int myid, float *sumHost, float step);

int main(int argc, char **argv) {
    int myid, nproc, tid, nbin;
    float pi = 0.0, pig, step;
    float *sumHost;                                   // Pointer to host array
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);             // My MPI rank
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);            // Number of MPI processes
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); // Array memory size
    sumHost = (float *)malloc(size);                  // Allocate array on host
    nbin = NBIN/nproc;                                // Number of bins per MPI process
    step = 1.0/(float)(nbin*nproc);                   // Step size with redefined number of bins
    computePI(nproc, myid, sumHost, step);
    /* Reduction over CUDA threads */
    for (tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++) pi += sumHost[tid];
    printf("step = %11.7f\n", step);
    pi *= step;
    free(sumHost);
    printf("myid = %d: partial pi = %11.7f\n", myid, pi);
    /* Reduction over MPI processes */
    MPI_Allreduce(&pi, &pig, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
    if (myid==0) printf("PI = %11.7f\n", pig);
    MPI_Finalize();
    return 0;
}
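For reference, main.cpp drives a midpoint-rule evaluation of the integral that defines π, with the per-bin contributions computed on the GPU in pi_cu.cu below:

\pi = \int_0^1 \frac{4}{1+x^2}\,dx \;\approx\; \sum_{i=0}^{N-1} \frac{4}{1+x_i^2}\,\Delta x, \qquad x_i = (i+0.5)\,\Delta x, \quad \Delta x = \mathtt{step} = \frac{1}{N}, \quad N = \mathtt{nbin}\cdot\mathtt{nproc}

Each MPI rank owns a contiguous block of nbin = NBIN/nproc bins, and within a rank the NUM_BLOCK*NUM_THREAD = 2496 CUDA threads pick up the bins of that block in round-robin order. The partial sums are then reduced twice: first over CUDA threads (the for loop above), then over MPI ranks (MPI_Allreduce).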
pi_cu.cu
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000  // Number of bins
#define NUM_BLOCK 13   // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block

// Kernel that executes on the CUDA device
__global__ void cal_pi(float *sum, int nbin, float step, float offset, int nthreads, int nblocks)
{
    int i;
    float x;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;  // Sequential thread index across blocks
    for (i=idx; i<nbin; i+=nthreads*nblocks) {      // Interleaved bin assignment to threads
        x = offset + (i+0.5)*step;
        sum[idx] += 4.0/(1.0+x*x);
    }
}

// Host wrapper: allocates device memory, launches the kernel, copies the result back
void computePI(int nproc, int myid, float *sumHost, float step)
{
    int nbin;
    float offset;
    float *sumDev;                                    // Pointer to device array
    dim3 dimGrid(NUM_BLOCK, 1, 1);                    // Grid dimensions (only use 1D)
    dim3 dimBlock(NUM_THREAD, 1, 1);                  // Block dimensions (only use 1D)
    nbin = NBIN/nproc;                                // Number of bins per MPI process
    offset = myid*step*nbin;                          // Quadrature-point offset of this rank's subinterval
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); // Array memory size
    cudaMalloc((void **)&sumDev, size);               // Allocate array on device
    cudaMemset(sumDev, 0, size);                      // Reset array on device to 0
    /* Calculate on device (call CUDA kernel) */
    cal_pi<<<dimGrid,dimBlock>>>(sumDev, nbin, step, offset, NUM_THREAD, NUM_BLOCK);
    /* Retrieve result from device and store it in host array */
    cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
    cudaFree(sumDev);
}
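None of the CUDA runtime calls in computePI check their return codes, so a failure (for example, when no GPU is available) goes unnoticed and the host array silently keeps stale values. Below is a minimal sketch of how the same wrapper could report such errors; the CUDA_CHECK macro and the name computePI_checked are illustrative additions, not part of the original code.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define NBIN 10000000
#define NUM_BLOCK 13
#define NUM_THREAD 192

/* Abort with a readable message whenever a CUDA runtime call fails
   (in a real MPI run one might prefer MPI_Abort over exit) */
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

/* Declared here only to keep the sketch self-contained; defined in pi_cu.cu above */
__global__ void cal_pi(float *sum, int nbin, float step, float offset,
                       int nthreads, int nblocks);

void computePI_checked(int nproc, int myid, float *sumHost, float step)
{
    int nbin = NBIN/nproc;                            // Number of bins per MPI process
    float offset = myid*step*nbin;                    // Quadrature-point offset
    float *sumDev;
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float);

    CUDA_CHECK(cudaMalloc((void **)&sumDev, size));
    CUDA_CHECK(cudaMemset(sumDev, 0, size));
    cal_pi<<<NUM_BLOCK,NUM_THREAD>>>(sumDev, nbin, step, offset, NUM_THREAD, NUM_BLOCK);
    CUDA_CHECK(cudaGetLastError());                   // catch kernel launch errors
    CUDA_CHECK(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(sumDev));
}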
Submitting the job with a bsub script
test.bsub
#BSUB -W 0:10
#BSUB -n 4
#BSUB -R "span[ptile=2]"
#BSUB -q "gpu"
#BSUB -o res.out
#BSUB -e out.err
module unload compiler
module load compiler/intel/composer_xe_2013_sp1.0.080
module unload mpi
module load mpi/mvapich2/1.9/intel
module unload cuda
module load cuda/6.0.37
mpijob.mvapich2 ./gpu-pi
Submit the job:
bsub < test.bsub
Makefile
all: gpu-pi

CFLAGS    += -O3
NVCCFLAGS += -I/soft/cuda/6.0.37/samples/common/inc/ -I/soft/cuda/6.0.37/include
NVCCFLAGS += -I/soft/mpi/mvapich2/1.9/intel/include/
NVCCLIB   += -L/soft/cuda/6.0.37/lib64 -lcudart

# Link with mpicc, add the CUDA runtime library, then submit the job
gpu-pi: main.o pi_cu.o
	mpicc $^ -o $@ $(NVCCLIB) -lm
	bsub < test.bsub

%.o: %.cpp
	mpicc $(NVCCFLAGS) $(CFLAGS) -o $@ -c $<

# -Wno-deprecated-gpu-targets is an nvcc-only flag, so it stays in this rule
%.o: %.cu
	nvcc $(NVCCFLAGS) $(CFLAGS) -Wno-deprecated-gpu-targets -o $@ -c $<

clean:
	rm -fr *.o *.err *.out gpu-pi
- The .cu file is compiled with nvcc and the .cpp file with mpicc; note that both the CUDA and the MPI include paths must be supplied at compile time.
- Linking is done with mpicc; remember to add the CUDA runtime library (-lcudart).
- If the job is not submitted through LSF but the program is run directly with mpirun on the login node, no usable result is produced, because no GPU is used: the unchecked CUDA calls fail, nothing is copied back into sumHost, and every partial sum stays zero:
[scatmstu1@login4 pi2]$ mpirun -np 4 ./gpu-pi
step = 0.0000001
myid = 0: partial pi = 0.0000000
step = 0.0000001
myid = 3: partial pi = 0.0000000
step = 0.0000001
myid = 1: partial pi = 0.0000000
step = 0.0000001
myid = 2: partial pi = 0.0000000
PI = 0.0000000
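(Had the CUDA calls been checked as in the CUDA_CHECK sketch after pi_cu.cu above, this run would likely have aborted with an explicit CUDA error message instead of silently printing zeros.)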
- Correct run output (cat res.out):
[scatmstu1@login4 pi2]$ cat res.out
Your job looked like:
------------------------------------------------------------
# LSBATCH: User input
#BSUB -W 0:10
#BSUB -n 4
#BSUB -R "span[ptile=2]"
#BSUB -q "gpu"
#BSUB -o res.out
#BSUB -e out.err
module unload compiler
module load compiler/intel/composer_xe_2013_sp1.0.080
module unload mpi
module load mpi/mvapich2/1.9/intel
module unload cuda
module load cuda/6.0.37
module load
mpijob.mvapich2 ./gpu-pi
------------------------------------------------------------
Successfully completed.
Resource usage summary:
CPU time : 8.34 sec.
Max Memory : 2 MB
Max Swap : 22 MB
Max Processes : 1
The output (if any) follows:
step = 0.0000001
myid = 0: partial pi = 0.9799146
step = 0.0000001
myid = 1: partial pi = 0.8746758
step = 0.0000001
myid = 2: partial pi = 0.7194140
step = 0.0000001
myid = 3: partial pi = 0.5675882
PI = 3.1415925
PS:
Read file <out.err> for stderr output of this job.
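As a quick sanity check, the four partial sums indeed add up to π: 0.9799146 + 0.8746758 + 0.7194140 + 0.5675882 = 3.1415926, which matches the reduced value up to single-precision rounding.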