Linux MPI+HIP混编

Linux MPI+HIP混编

源文件:
pi_hip.cpp

#include<stdio.h>
#include<stdlib.h>
#include <hip/hip_runtime.h>

#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13  // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block

/*
 * HIP kernel: adds this thread's share of the integrand 4/(1+x^2) to sum[idx].
 *
 *   sum      - device array indexed by the global thread index; must hold at
 *              least one element per launched thread. Values already stored
 *              in it are accumulated onto, not overwritten.
 *   nbin     - number of bins handled by this launch
 *   step     - width of one bin
 *   offset   - x coordinate at which bin 0 of this launch starts
 *   nthreads - threads per block used to compute the stride
 *   nblocks  - number of blocks used to compute the stride
 *
 * Thread idx visits bins idx, idx+nthreads*nblocks, idx+2*nthreads*nblocks, ...
 * and evaluates the integrand at the midpoint of each bin.
 */
__global__ void cal_pi(float *sum,int nbin,float step,float offset,int nthreads,int nblocks)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
    const int stride = nthreads * nblocks;                 // Distance between bins of one thread

    // Start from the value already in the slot so the rounding sequence is
    // the same as accumulating directly into sum[idx].
    float acc = sum[idx];

    for (int bin = idx; bin < nbin; bin += stride) {
        const float x = offset + (bin + 0.5) * step;       // Midpoint of this bin
        acc += 4.0 / (1.0 + x * x);
    }

    sum[idx] = acc;
}

void computePI(int nproc,int myid, float *sumHost,float step)
{
  int nbin;
  float offset;
  float *sumDev; // Pointers to device arrays
  dim3 dimGrid(NUM_BLOCK,1,1); // Grid dimensions (only use 1D)
  dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions (only use 1D)
 
  nbin = NBIN/nproc; // Number of bins per MPI process
  offset = myid*step*nbin; // Quadrature-point offset
  size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
  hipMalloc((void **) &sumDev,size); // Allocate array on device
  hipMemset(sumDev,0,size); // Reset array in device to 0
  // // Calculate on device (call CUDA kernel)
  hipLaunchKernelGGL(cal_pi,dimGrid,dimBlock,0,0,sumDev,nbin,step,offset,NUM_THREAD,NUM_BLOCK);
  // // Retrieve result from device and store it in host array
  hipMemcpy(sumHost,sumDev,size,hipMemcpyDeviceToHost);

  hipFree(sumDev);
}

源文件:
main.cpp

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
// Host-side wrapper defined in pi_hip.cpp: launches the HIP kernel and copies the per-thread partial sums into sumHost
void computePI(int nproc,int myid,float *sumHost,float step);
 int main(int argc,char **argv) {
 int myid,nproc,tid, nbin;
 float pi=0.0, pig, step;
 float *sumHost; // Pointers to host arrays
 MPI_Init(&argc,&argv);
 MPI_Comm_rank(MPI_COMM_WORLD,&myid); // My MPI rank
 MPI_Comm_size(MPI_COMM_WORLD,&nproc); // Number of MPI processes

 size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
 sumHost = (float *)malloc(size); // Allocate array on host
 nbin = NBIN/nproc; // Number of bins per MPI process
 step = 1.0/(float)(nbin*nproc); // Step size with redefined number of bins
 computePI(nproc,myid,sumHost,step);

// // Reduction over CUDA threads
 for(tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++) pi += sumHost[tid];
 printf("step = %11.7f\n", step);
 pi *=step;
 free(sumHost);
 printf("myid = %d: partial pi = %11.7f\n",myid, pi);
// // Reduction over MPI processes
 MPI_Allreduce(&pi,&pig,1,MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD);
 if (myid==0) printf("PI = %11.7f\n",pig);
 MPI_Finalize();
 return 0;
 }

Makefile文件:

方法1:直接将pi_hip.cpp编译成.o文件
# MPI library used on the final link line.
MPILIB=-L/opt/hpc/software/openmpi-3.1.2/lib -lmpi

# Neither target produces a file of its own name.
.PHONY: all clean

# Rebuild from scratch: HIP object with hipcc, MPI object with mpicc,
# link with hipcc, then run the result on 8 ranks.
# Libraries come after the object files that reference them so the link
# does not depend on the linker's handling of early -l options.
# NOTE(review): the HIP headers test __HIP_PLATFORM_HCC__ (two leading
# underscores) and hipcc normally sets the platform macro itself, so the
# -D below is probably a no-op; confirm before changing it.
all:
	$(MAKE) clean
	hipcc -c -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -o pi_hip.o pi_hip.cpp
	mpicc -w -g -o main.o -c main.cpp
	hipcc -g -o out-pi pi_hip.o main.o $(MPILIB) -lm -lstdc++
	mpirun -mca pml ucx -np 8 ./out-pi

# Remove objects, job output files and the executable.
clean:
	rm -fr *.o *.err *.out out-pi

编译: 分别用 hipcc 和mpicc 编译 *_hip.cpp 和 *.cpp 文件 (不分前后)

链接: 用hipcc链接,注意加上MPI的库(-lmpi)

运行: 直接用mpirun运行,注意加上一些参数(如 -mca pml ucx)

方法2:直接将pi_hip.cpp编译动态库(.so文件)
# HIP runtime library location. Not referenced by any rule below; hipcc
# performs the link.
HIPLIB=-L/opt/rocm/hip/lib -lhip_hcc
# MPI library used on the final link line.
MPILIB=-L/opt/hpc/software/mpi/hpcx/v2.4.0/ompi/lib -lmpi

# Neither target produces a file of its own name.
.PHONY: all clean

# Rebuild from scratch: position-independent HIP object -> libpi_hip.so,
# MPI object with mpicc, link with hipcc, then run on 8 ranks.
# Recipe lines must start with a TAB character.
# main.o precedes -lpi_hip so the library is searched after the object
# that needs computePI, and the rpath lets ./out-pi locate libpi_hip.so
# in the build directory without setting LD_LIBRARY_PATH.
all:
	$(MAKE) clean
	hipcc -c -fpic -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -o pi_hip.o pi_hip.cpp
	hipcc -shared pi_hip.o -o libpi_hip.so
	mpicc -w -g -o main.o -c main.cpp
	hipcc -g -o out-pi main.o -L./ -lpi_hip -Wl,-rpath,$(CURDIR) $(MPILIB) -lm -lstdc++
	mpirun -mca pml ucx -np 8 ./out-pi

# Remove objects, job output files, the executable and the shared library.
clean:
	rm -fr *.o *.err *.out out-pi *.so

编译:
用hipcc将 *_hip.cpp 编译成.o文件,再进一步编译成.so文件;
用mpicc将 *.cpp 文件编译成.o文件

链接: 用hipcc链接,注意加上MPI的库(-lmpi)和生成的.so文件(-lpi_hip)

运行: 直接用mpirun运行,注意加上一些参数(如 -mca pml ucx)

上一篇:链栈


下一篇:macos安装MPI-IS / mesh遇到的坑