【cuda】 thrust 进行加速

官方网站

  • https://docs.nvidia.com/cuda/thrust/index.html
  • https://github.com/NVIDIA/thrust

thrust的特点

  • thrust一般还是只能单独(从host中)调用,并不适合和.cu混合使用。
  • thrust中的算法主要是建立在vector和map<key,value>这两种数据结构之上。 比较适合工程使用,并不能实现复杂的算法;
  • 如果你需要处理big size的vector和map,并且操作都比较简单,可以考虑使用 thrust;调用方源文件后缀是.cpp,编译时使用cuda
  • c++ release版本(debug目录有问题)

代码部分

定义调用thrust的封装函数(gpuInit)后,在main.cpp中进行调用

thrust_abcd.cuh

#pragma once
#include <chrono>
#include <iostream>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
// Reads 25 * 25 ints starting at `a`, copies them into two device vectors,
// adds the two vectors element-wise on the device and copies the result back
// to the host. Prints the measured copy time and compute time, and adds each
// measurement to the matching total below when it is under 1 ms.
void gpuInit(int* a);
// Running sum (ms) of the per-call compute timings that were under 1 ms.
extern float time_pure_gpu;
// Running sum (ms) of the per-call host-to-device copy timings that were under 1 ms.
extern float time_cp_total;

thrust_abcd.cu

#include "thrust_abcd.cuh"

/// Binary functor computing the SAXPY update `a * x + y` for one pair of
/// elements. `operator()` is marked `__host__ __device__`, so it can be called
/// from both host and device code.
struct saxpy_functor
{
    /// Scalar multiplier applied to the first operand.
    const float a;

    /// @param _a scale factor stored in `a`.
    saxpy_functor(float _a) : a(_a) {}

    /// @param x first operand (scaled by `a`).
    /// @param y second operand (added unscaled).
    /// @return a * x + y.
    __host__ __device__
    float operator()(const float& x, const float& y) const {
        // The stored scale factor has to take part in the result; returning a
        // plain x + y would leave `a` as dead state and contradict the name.
        return a * x + y;
    }
};

float time_pure_gpu = 0;
float time_cp_total = 0;
void gpuInit(int* a)
{
   
    thrust::host_vector<int> H(a, a + 25 * 25); // get cpu data
    std::cout << "H has size " << H.size() << std::endl;

    // Copy host_vector H to device_vector D
    std::chrono::time_point<std::chrono::high_resolution_clock> p0 = std::chrono::high_resolution_clock::now();
    thrust::device_vector<int> D = H;   // listd
    thrust::device_vector<int> E = H;   // liste
    thrust::device_vector<int> D_res(E.size()); // listd + liste -> res
    std::chrono::time_point<std::chrono::high_resolution_clock> p1 = std::chrono::high_resolution_clock::now();
    float time_cp = (float)std::chrono::duration_cast<std::chrono::microseconds>(p1 - p0).count() / 1000;
    std::cout << "copy data to gpu time:" << time_cp << "ms" << std::endl;
    
    if (time_cp < 1)
    time_cp_total += time_cp;


    //thrust::transform(数据A的开始, 数据A的结尾, 数据B的开始, 计算结果的开始, 对应算法thrust::multiplies<float>());
    std::chrono::time_point<std::chrono::high_resolution_clock> p2 = std::chrono::high_resolution_clock::now();
    thrust::transform(D.begin(), D.end(), E.begin(), D_res.begin(), thrust::plus<float>());
    std::chrono::time_point<std::chrono::high_resolution_clock> p3 = std::chrono::high_resolution_clock::now();
    float time_g = (float)std::chrono::duration_cast<std::chrono::microseconds>(p3 - p2).count() / 1000;
    std::cout << "pure gpu time:" << time_g << "ms" << std::endl;
    if (time_g <1)
    time_pure_gpu += time_g;

    thrust::host_vector<int> H_res = D_res;

   /* for (int i = 0; i < D.size(); i++)
    {
        std::cout << "D[" << i << "] = " << D[i] << std::endl;
        std::cout << "res[" << i << "] = " << res[i] << std::endl;
    }*/
    
}

main.cpp

#include "thrust_abcd.cuh"
// Benchmark driver: fills a 25 x 25 int buffer, runs gpuInit() once as a
// warm-up and then kIterations more times, and prints the accumulated timings
// divided by kIterations.
int main()
{
    constexpr int kRows = 25;
    constexpr int kCols = 25;
    constexpr int kCount = kRows * kCols;
    constexpr int kIterations = 100;

    // Input data: a[i] = i * (i + 1).
    int a[kCount];
    for (int i = 0; i < kCount; ++i)
    {
        a[i] = i * (i + 1);
    }

    // Warm-up call. Its timings are discarded so that the totals below only
    // cover the kIterations measured calls they are divided by.
    gpuInit(a);
    time_pure_gpu = 0;
    time_cp_total = 0;

    for (int i = 0; i < kIterations; ++i)
    {
        gpuInit(a);
    }

    std::cout << "avg pure gpu time: " << time_pure_gpu / kIterations << "ms" << std::endl;
    std::cout << "avg cp gpu time: " << time_cp_total / kIterations << "ms" << std::endl;
    std::cout << "add over" << std::endl;
    return 0;
}
上一篇:opencv cuda加速官方教程2:Using a cv::cuda::GpuMat with thrust


下一篇:CUDA--Thrust--数学运算(5)