【CUDA Practice】Projecting a LiDAR Point Cloud into a Bird's-Eye-View Depth Image

OK, let's continue with another CUDA exercise. Among the many approaches to LiDAR point-cloud object detection, one family operates on depth images produced by projecting the point cloud top-down into a bird's-eye view (BEV). The projection step itself takes only a few lines of straightforward logic, whether in Python or C++. From an efficiency standpoint, however, once the number of points grows large we can use CUDA to accelerate it. For comparison, let's start with the C++ implementation.

【Source Code】

#include <stdio.h>
#include <cmath>
#include <vector>
#include "bev.h"

// Map a height z within [z_min, z_max] linearly to a gray value in [0, 255].
int scale_to_255(float z, float z_min, float z_max) {
    return int(((z - z_min) / (z_max - z_min)) * 255);
}

void bev(const float* points,
         int num_points,
         unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size,
         int in_cols) {
    for (int i = 0; i < num_points; i++) {
        const float* point = points + i * in_cols;
        int x = std::floor((point[0] - cloud_range[0]) / voxel_size[0]);
        int y = std::floor((point[1] - cloud_range[1]) / voxel_size[1]);
        int z = std::floor((point[2] - cloud_range[2]) / voxel_size[2]);
        if (x < 0 || y < 0 || z < 0 || x >= grid_size[0] || y >= grid_size[1] || z >= grid_size[2]) {
            continue;
        }
        //TODO: add range filter here
        // Flip the axes so far/left ends up at the top/left of the image.
        // Note the -1: without it, x == 0 would index one row past the image.
        int row = grid_size[0] - 1 - x;
        int col = grid_size[1] - 1 - y;
        int tmp = scale_to_255(point[2], cloud_range[2], cloud_range[5]);
        int offset = row * grid_size[1] + col;
        // Keep the highest point that projects onto this pixel.
        out[offset] = tmp > out[offset] ? tmp : out[offset];
    }
}

Parameter notes:

cloud_range: the point-cloud range [xmin, ymin, zmin, xmax, ymax, zmax]. It defines an axis-aligned box; only points inside it are considered.

voxel_size: the voxel dimensions [dx, dy, dz]. Since we project 3D down to a 2D top view, the z dimension is collapsed to a single cell; in practice dz is set equal to (zmax - zmin).

grid_size: the grid dimensions [grid_x, grid_y, grid_z], which follow directly from the previous two parameters.
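As a worked example with the test settings used below (cloud_range = {0, -50, -5, 100, 50, 15}, voxel_size = {0.09765, 0.09765, 20}): a point at (50.0, 0.0, 5.0) falls into voxel (512, 512, 0), lands on pixel (row 511, col 511) after the axis flip, and contributes gray value scale_to_255(5.0, -5.0, 15.0) = int(0.5 * 255) = 127.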

#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <pcl/point_cloud.h>
#include <pcl/point_types.h>
#include "bev.h"

void Test(const pcl::PointCloud<pcl::PointXYZ>::Ptr cloud) {
    printf("recv points:%d \n", cloud->width);

    std::vector<int> grid_size(3);
    std::vector<float> voxel_size = {0.09765, 0.09765, 20};
    std::vector<float> cloud_range = {0.0, -50., -5., 100., 50., 15.};
    for (int i = 0; i < 3; i++) {
        grid_size[i] = round((cloud_range[3 + i] - cloud_range[i]) / voxel_size[i]);
    }

    // Flatten the PCL cloud into a row-major float array. The original post
    // elides how `data` is filled; this assumes 3 floats (x, y, z) per point.
    const int in_cols = 3;
    int num_points = static_cast<int>(cloud->size());
    float* data = (float*)malloc(sizeof(float) * num_points * in_cols);
    for (int i = 0; i < num_points; i++) {
        data[i * in_cols + 0] = cloud->points[i].x;
        data[i * in_cols + 1] = cloud->points[i].y;
        data[i * in_cols + 2] = cloud->points[i].z;
    }

    cv::Mat gray(grid_size[0], grid_size[1], CV_8UC1, cv::Scalar(0));
    if (0) {
        //write cuda bev here
    } else {
        std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
        bev(data, num_points, gray.data, cloud_range, voxel_size, grid_size, in_cols);
        std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::milli> time_span = t2 - t1;
        std::cout << "time_cost: " << time_span.count() << " ms" << std::endl;
    }

    {
        cv::imshow("preview", gray);
        auto key = cv::waitKey(5);
        if (key == 'q' || key == 'Q') {
            exit(0);
        }
    }

    free(data);
}

The code above runs a simple test and displays the projected depth image. I chose a 100 x 100 x 20 m region; with the given voxel_size the computed grid_size comes out to [1024, 1024, 1].

【Test】

recv points:58900 
time_cost: 1.00689 ms
recv points:57131 
time_cost: 2.32245 ms
recv points:56297 
time_cost: 0.959922 ms
recv points:56186 
time_cost: 2.5897 ms
recv points:56627 
time_cost: 2.41959 ms

Clearly, the loop in the C++ implementation can be replaced with parallel CUDA work. One detail deserves attention: the top-down projection encodes each point's height into the pixel's gray value, and when multiple points project onto the same pixel, only the highest one should be encoded. In a sequential loop this is easy, a simple comparison guarantees the maximum survives. Under CUDA, however, every point computes its projected coordinates and gray value independently. To ensure each pixel still ends up encoding its highest point, I split the projection into two steps (the HashEntry record that chains points landing on the same pixel is sketched after the Step 1 kernel):

Step 1: compute only each point's projected pixel position and its gray value;

__device__ int scale_to_255(float z, float z_min, float z_max) {
    return int(((z - z_min) / (z_max - z_min)) * 255);
}

__global__ void InitializeMapEntries(const float* __restrict__ points,
                                     const int num_points,
                                     float range_min_x, float range_min_y, float range_min_z, float range_max_z,
                                     float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                     int grid_x, int grid_y, int grid_z,
                                     int* grid_first_entry, HashEntry* entry_list,
                                     int cols)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;

    // Grid-stride loop: one thread may handle several points.
    for (int i = idx; i < num_points; i += stride)
    {
        const float* cur_point = points + i * cols;
        int x = __float2int_rd((cur_point[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((cur_point[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((cur_point[2] - range_min_z) / voxel_size_z);
        if (x < 0 || y < 0 || z < 0 || x >= grid_x || y >= grid_y || z >= grid_z) continue;

        // Same axis flip as the CPU version (with the -1 to stay in bounds).
        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;

        HashEntry* entry = entry_list + i;
        entry->gray_var = scale_to_255(cur_point[2], range_min_z, range_max_z);
        int grid_idx = row * grid_y + col;
        int* address = grid_first_entry + grid_idx;
        int new_var = i;
        int cur_var = *address;
        int assumed;

        // Lock-free push: atomically install this point as the new head of the
        // pixel's list, remembering the previous head as our successor.
        do {
            assumed = cur_var;
            cur_var = atomicCAS(address, assumed, new_var);
        } while (assumed != cur_var);
        entry->next_id = cur_var;
    }
}
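The atomicCAS loop is a lock-free head insertion: grid_first_entry[grid_idx] always holds the index of the most recently inserted point for that pixel, and each entry's next_id points to the previous head, with -1 marking the end of the chain. Writing next_id only after the successful swap is safe here because nothing traverses the chains until the second kernel runs. HashEntry itself is declared in bev.h, which the post does not show; a minimal layout consistent with how the kernels use it (my assumption) would be:

struct HashEntry {
    int gray_var;  // gray value encoded from the point's height
    int next_id;   // index of the next entry in this pixel's chain; -1 terminates
};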

Step 2: for every pixel that received at least one point, compare all the points that landed there and write the gray value of the highest one.

【Core Code】

__global__
void ExtractValidOutPixelKernel(unsigned char* d_out, HashEntry* entry_list, int* grid_first_entry, int grid_bev_size)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;

    // One thread per output pixel (grid-stride loop).
    for (int i = idx; i < grid_bev_size; i += stride)
    {
        int entry_idx = grid_first_entry[i];
        if (entry_idx >= 0) {
            // Walk this pixel's chain and keep the maximum gray value.
            HashEntry* entry = entry_list + entry_idx;
            int max_gray = entry->gray_var;
            int next_idx = entry->next_id;
            while (next_idx >= 0)
            {
                HashEntry* entry_next = entry_list + next_idx;
                next_idx = entry_next->next_id;
                max_gray = entry_next->gray_var > max_gray ? entry_next->gray_var : max_gray;
            }
            d_out[i] = static_cast<unsigned char>(max_gray);
        }
    }
}
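As a side note, for this particular reduction (a maximum) the two-step structure is not strictly required: CUDA's atomicMax on an int buffer can resolve the race in a single pass. The sketch below is my own alternative, not the post's method; it writes into an int-typed intermediate image that would be converted to 8-bit afterwards. The linked-list approach above generalizes better when more than a single scalar must be kept per pixel.

__global__ void BevAtomicMaxKernel(const float* __restrict__ points, int num_points,
                                   float range_min_x, float range_min_y, float range_min_z, float range_max_z,
                                   float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                   int grid_x, int grid_y, int grid_z,
                                   int* d_out_int, int cols)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < num_points; i += stride) {
        const float* p = points + i * cols;
        int x = __float2int_rd((p[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((p[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((p[2] - range_min_z) / voxel_size_z);
        if (x < 0 || y < 0 || z < 0 || x >= grid_x || y >= grid_y || z >= grid_z) continue;
        // atomicMax resolves concurrent writes to the same pixel.
        atomicMax(d_out_int + (grid_x - 1 - x) * grid_y + (grid_y - 1 - y),
                  scale_to_255(p[2], range_min_z, range_max_z));
    }
}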

void cuda_bev(const float* points,
         int num_points,
         unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size,
         int in_cols)
{
    float * d_points;
    unsigned char* d_out;
    HashEntry* d_hash_list;
    int* d_grid_first_entry;
    int grid_bev_size = grid_size[0]*grid_size[1];

    CHECK(cudaMalloc((void**)&d_points, sizeof(float)*in_cols*num_points));
    CHECK(cudaMalloc((void**)&d_out, sizeof(unsigned char)*grid_bev_size));
    CHECK(cudaMalloc((void**)&d_hash_list, sizeof(HashEntry)*num_points));
    CHECK(cudaMalloc((void**)&d_grid_first_entry, sizeof(int)*grid_bev_size));
    // Helper launches (defined elsewhere in the project) that fill the device
    // buffers: every pixel's list head starts at -1 (empty), the image at 0.
    init_int_(d_grid_first_entry, -1, grid_bev_size);
    init_uchar_(d_out, 0, grid_bev_size);
    init_hash_list_(d_hash_list, num_points);

    cudaEvent_t start,stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);  // flush the event to the stream (matters on WDDM platforms)

    CHECK(cudaMemcpy((void*)d_points, points, sizeof(float)*in_cols*num_points, cudaMemcpyHostToDevice));

    // Let the runtime suggest a block size, then cap the grid so that one
    // grid-stride pass covers all points.
    int blockSize = 512;
    int minGridSize = 1;
    int num_thread = num_points;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, InitializeMapEntries, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    InitializeMapEntries<<<minGridSize, blockSize>>>(d_points, num_points,
                                                     cloud_range[0], cloud_range[1], cloud_range[2], cloud_range[5],
                                                     voxel_size[0], voxel_size[1], voxel_size[2],
                                                     grid_size[0], grid_size[1], grid_size[2],
                                                     d_grid_first_entry, d_hash_list, in_cols);

    num_thread = grid_bev_size;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, ExtractValidOutPixelKernel, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    ExtractValidOutPixelKernel<<<minGridSize, blockSize>>>(d_out, d_hash_list, d_grid_first_entry, grid_bev_size);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(out, d_out, sizeof(unsigned char) * grid_bev_size, cudaMemcpyDeviceToHost));

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f(ms)\n", elapsed_time);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    CHECK(cudaFree(d_points));
    CHECK(cudaFree(d_out));
    CHECK(cudaFree(d_hash_list));
    CHECK(cudaFree(d_grid_first_entry));
}
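Wiring this into the Test function above only takes flipping the `if (0)` placeholder and calling cuda_bev with the same arguments as the CPU path:

    cuda_bev(data, num_points, gray.data, cloud_range, voxel_size, grid_size, in_cols);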

【Test】

recv points:58900 
time cost: 0.470(ms)
recv points:59541 
time cost: 0.502(ms)
recv points:58875 
time cost: 0.822(ms)
recv points:58856 
time cost: 0.786(ms)
recv points:59224 
time cost: 0.819(ms)
recv points:59459 
time cost: 0.952(ms)
recv points:58728 
time cost: 0.504(ms)
recv points:59197 
time cost: 0.503(ms)
recv points:59404 
time cost: 0.844(ms)

Admittedly, this comparison has its flaws. The CUDA figure covers the host-device copies and the two kernel launches, but the device memory allocation and buffer initialization happen outside the timed interval.
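For a fairer end-to-end measurement, one could wrap the entire cuda_bev call, allocations and copies included, with the same steady_clock timer used for the CPU path; a sketch:

    auto t1 = std::chrono::steady_clock::now();
    cuda_bev(data, num_points, gray.data, cloud_range, voxel_size, grid_size, in_cols);
    auto t2 = std::chrono::steady_clock::now();
    std::cout << "end_to_end: "
              << std::chrono::duration<double, std::milli>(t2 - t1).count()
              << " ms" << std::endl;

In a real pipeline the device buffers would normally be allocated once and reused across frames, which removes most of the allocation overhead from the per-frame cost.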
 
