ok,继续做一个CUDA方面的练习。在激光点云目标检测的诸多方法中有一类方法是建立在BEV视角下点云俯视投影出来的深度图像上面的。当然了,就俯视投影这一步而言,无论用python还是c++都可以用很简单的逻辑来完成。但是从效率上来考虑,当点的数量比较多的时候,我们也可以借助CUDA来进行加速。为方便对比,还是先摆出c++实现的例子。
【源代码】
#include <stdio.h>
#include "math.h"
#include "bev.h"
// Linearly map a height z within [z_min, z_max] to an 8-bit gray level.
// Callers must guarantee z_max > z_min (division by the range).
int scale_to_255(float z, float z_min, float z_max) {
    return int(((z-z_min)/(z_max-z_min)) * 255);
}

// CPU reference: project a point cloud onto a BEV (bird's-eye-view) gray
// image. Each point's height is encoded as gray value; when several points
// fall on one pixel, the highest point wins.
//   points      flat buffer, one point per `in_cols` floats (x,y,z,...)
//   num_points  number of points in `points`
//   out         output image buffer, grid_size[0] x grid_size[1] bytes,
//               caller-initialized (typically zeroed)
//   cloud_range [xmin,ymin,zmin,xmax,ymax,zmax] crop box
//   voxel_size  [dx,dy,dz]; dz spans the whole z range so z collapses to 1
//   grid_size   [grid_x,grid_y,grid_z] cell counts derived from the above
//   in_cols     stride (floats) between consecutive points
void bev(const float* points,
         int num_points,
         unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size,
         int in_cols) {
    for (int i = 0; i < num_points; i++) {
        const float* point = points + i*in_cols;
        // Voxel coordinates; floor so points just below the range go negative
        // and are rejected by the bounds check.
        int x = std::floor((point[0] - cloud_range[0]) / voxel_size[0]);
        int y = std::floor((point[1] - cloud_range[1]) / voxel_size[1]);
        int z = std::floor((point[2] - cloud_range[2]) / voxel_size[2]);
        if (x<0 || y<0 || z<0 || x>=grid_size[0] || y>=grid_size[1] || z>=grid_size[2]) {
            continue;
        }
        //TODO: add range filter here
        // BUG FIX: x,y lie in [0, grid-1], so the flipped image coordinate is
        // grid-1-x. The original `grid_size[0] - x` produced row==grid_size[0]
        // for x==0, writing one row/column past the image buffer.
        int row = grid_size[0] - 1 - x;
        int col = grid_size[1] - 1 - y;
        int tmp = scale_to_255(point[2], cloud_range[2], cloud_range[5]);
        int offset = row*grid_size[1] + col;
        // Keep the brightest (i.e. highest) point per pixel.
        out[offset] = tmp > out[offset] ? tmp : out[offset];
    }
}
参数说明:
cloud_range:点云范围,[xmin,ymin,zmin,xmax,ymax,zmax],框定一个立体矩形范围,只考虑范围内的点.
voxel_size:体素大小,[dx,dy,dz].因为是要做3D到2D的俯视投影,z方向会压缩到1,实际使用时dz的大小会设置成和(zmax-zmin)一样大。
grid_size:网格尺寸[grid_x,grid_y,grid_z],这个根据前两个参数就可以计算出来。
// Smoke test: project an incoming PCL cloud to a BEV gray image, time the
// projection, and display the result (press 'q'/'Q' to quit).
// NOTE(review): `data`, `num_points` and `in_cols` are not declared in the
// visible snippet -- presumably a flat float buffer extracted from `cloud`
// in code omitted from the post; confirm against the full source.
void Test(const pcl::PointCloud<pcl::PointXYZ>::Ptr cloud) {
printf("recv points:%d \n", cloud->width);
std::vector<int> grid_size(3);
// dz (20) equals zmax - zmin below, so the z axis collapses to a single
// cell: a pure top-down projection.
std::vector<float> voxel_size = {0.09765, 0.09765, 20};
// [xmin, ymin, zmin, xmax, ymax, zmax]
std::vector<float> cloud_range = {0.0, -50., -5., 100., 50., 15.};
// grid_size = rounded range extent / voxel size -> [1024, 1024, 1] here
for (int i = 0; i < 3; i++) {
grid_size[i] = round((cloud_range[3 + i] - cloud_range[i]) / voxel_size[i]);
}
//printf("grid_size[0]:%d,grid_size[1]:%d\n",grid_size[0],grid_size[1]);
// 8-bit single-channel output image, zero-initialized (black)
cv::Mat gray(grid_size[0], grid_size[1], CV_8UC1, cv::Scalar(0));
// `if (0)`: CUDA path placeholder; only the CPU branch runs for now
if (0) {
//write cuda bev here
} else {
// Time only the CPU projection, in milliseconds.
std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
bev(data,num_points,gray.data,cloud_range,voxel_size,grid_size,in_cols);
std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
std::chrono::duration<double, std::ratio<1, 1000>> time_span =
std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
std::cout << "time_cost: " << time_span.count() << " ms" << std::endl;
}
{
cv::imshow("preview", gray);
auto key = cv::waitKey(5);
if (key == 'q' || key == 'Q') {
exit(0);
}
}
// `data` appears to be heap-allocated elsewhere; freed here after use.
free(data);
}
通过上面这段代码可以简单测试并显示投影后的深度图像。我设置了一个100x100x20的区域,根据voxel_size计算出的grid_size的大小为[1024,1024,1]。
【测试】
recv points:58900
time_cost: 1.00689 ms
recv points:57131
time_cost: 2.32245 ms
recv points:56297
time_cost: 0.959922 ms
recv points:56186
time_cost: 2.5897 ms
recv points:56627
time_cost: 2.41959 ms
显然,c++实现中的这个循环操作是可以用CUDA的并行操作来取代的。有一点值得注意,在俯视投影时,点云的高度信息是被编码在图像的灰度数值中的。当有多个点被投影到一个像素位置时,只会编码了高度值最大的那个点。在循环处理时这一点很好处理,通过简单的比较就可以保证最后编码的是高度最大的那一个点。但是,在CUDA并行计算时,每个点都是独立计算投影坐标及其灰度值。为了保证像素灰度值编码的是高度值最大的那一个点,我将投影过程拆成了两步:
Step1:只确定每个点的投影位置并计算其灰度值;
// Linearly map a height z within [z_min, z_max] to an 8-bit gray level.
// Device-side twin of the host scale_to_255; callers guarantee z_max > z_min.
__device__ int scale_to_255(float z, float z_min, float z_max) {
    float normalized = (z - z_min) / (z_max - z_min);
    return (int)(normalized * 255);
}
// Step-1 kernel: for every in-range point, compute its BEV pixel and push the
// point onto that pixel's lock-free singly-linked list.
//   grid_first_entry[pixel] -- list head (point index, -1 == empty list)
//   entry_list[i]           -- node for point i: gray value + next-pointer
// Grid-stride loop, so any <<<grid, block>>> configuration covers all points.
__global__ void InitializeMapEntries(const float* __restrict__ points,
                                     const int num_points,
                                     float range_min_x, float range_min_y, float range_min_z, float range_max_z,
                                     float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                     int grid_x, int grid_y, int grid_z,
                                     int* grid_first_entry, HashEntry* entry_list,
                                     int cols)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < num_points; i += stride)
    {
        const float* cur_point = points + i * cols;
        // Voxel coordinates; round toward -inf so sub-range points go negative.
        int x = __float2int_rd((cur_point[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((cur_point[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((cur_point[2] - range_min_z) / voxel_size_z);
        if (x<0 || y<0 || z<0 || x>=grid_x || y>=grid_y || z>=grid_z) continue;
        // BUG FIX: x,y lie in [0, grid-1], so the flipped coordinate must be
        // grid-1-x. The original `grid_x - x` produced row == grid_x for
        // x == 0, an out-of-bounds pixel index into grid_first_entry.
        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;
        HashEntry* entry = entry_list + i;
        entry->gray_var = scale_to_255(cur_point[2], range_min_z, range_max_z);
        int grid_idx = row * grid_y + col;
        // Lock-free list push: link the node to the observed head BEFORE the
        // CAS publishes it, so no reader can ever see an unlinked node (the
        // original published first and set next_id afterwards -- a latent
        // race if the lists were ever read concurrently).
        int* address = grid_first_entry + grid_idx;
        int cur_head = *address;
        int assumed;
        do {
            assumed = cur_head;
            entry->next_id = assumed;
            cur_head = atomicCAS(address, assumed, i);
        } while (assumed != cur_head);
    }
}
Step2:比较每个有效像素位置上的所有点的高度值,选出高度值最大的点的灰度值为像素的灰度值。
【核心代码】
// Step-2 kernel: for each BEV pixel, walk the linked list of points that
// landed on it and write the maximum gray value; pixels with an empty list
// (head == -1) are left untouched.
// `points` is not used by this kernel; it is kept for signature
// compatibility with existing call sites.
__global__
void ExtractValidOutPixelKernel(const float* points, unsigned char* d_out, HashEntry* entry_list, int* grid_first_entry, int grid_bev_size)
{
    const int first = threadIdx.x + blockDim.x * blockIdx.x;
    const int step = gridDim.x * blockDim.x;
    // Grid-stride loop over all BEV pixels.
    for (int pixel = first; pixel < grid_bev_size; pixel += step)
    {
        int node = grid_first_entry[pixel];
        if (node < 0) continue;  // no point projected onto this pixel
        int best = entry_list[node].gray_var;
        // Scan the rest of the chain for a brighter (higher) point.
        for (node = entry_list[node].next_id; node >= 0; node = entry_list[node].next_id)
        {
            int gray = entry_list[node].gray_var;
            if (gray > best) best = gray;
        }
        d_out[pixel] = best;
    }
}
// Host wrapper for the GPU BEV projection: uploads the point buffer, builds
// the per-pixel linked lists (kernel 1), reduces each list to its maximum
// gray value (kernel 2), and copies the image back into `out`.
// The reported time covers upload + both kernels + download; the cudaMalloc
// and buffer-initialization calls above the first event are NOT timed.
void cuda_bev(const float* points,
              int num_points,
              unsigned char* out,
              const std::vector<float>& cloud_range,
              const std::vector<float>& voxel_size,
              const std::vector<int>& grid_size,
              int in_cols)
{
    float* d_points;
    unsigned char* d_out;
    HashEntry* d_hash_list;
    int* d_grid_first_entry;
    int grid_bev_size = grid_size[0] * grid_size[1];
    CHECK(cudaMalloc((void**)&d_points, sizeof(float) * in_cols * num_points));
    CHECK(cudaMalloc((void**)&d_out, sizeof(unsigned char) * grid_bev_size));
    CHECK(cudaMalloc((void**)&d_hash_list, sizeof(HashEntry) * num_points));
    CHECK(cudaMalloc((void**)&d_grid_first_entry, sizeof(int) * grid_bev_size));
    init_int_(d_grid_first_entry, -1, grid_bev_size);  // -1 == empty list head
    init_uchar_(d_out, 0, grid_bev_size);
    init_hash_list_(d_hash_list, num_points);
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);  // return value deliberately ignored (flushes the event)
    CHECK(cudaMemcpy((void*)d_points, points, sizeof(float) * in_cols * num_points, cudaMemcpyHostToDevice));
    int blockSize = 512;
    int minGridSize = 1;
    int num_thread = num_points;
    // Let the runtime pick an occupancy-friendly block size, then clamp the
    // grid so we never launch more blocks than there is work.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, InitializeMapEntries, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    InitializeMapEntries<<<minGridSize, blockSize>>>(d_points, num_points,
            cloud_range[0], cloud_range[1], cloud_range[2], cloud_range[5],
            voxel_size[0], voxel_size[1], voxel_size[2],
            grid_size[0], grid_size[1], grid_size[2],
            d_grid_first_entry, d_hash_list, in_cols);
    CHECK(cudaGetLastError());  // catch launch-configuration errors
    num_thread = grid_bev_size;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, ExtractValidOutPixelKernel, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    // BUG FIX: the original passed the HOST pointer `points` here. Kernels
    // must only receive device pointers; the kernel does not currently
    // dereference this argument, but it was one edit away from an illegal
    // address fault.
    ExtractValidOutPixelKernel<<<minGridSize, blockSize>>>(d_points, d_out, d_hash_list, d_grid_first_entry, grid_bev_size);
    CHECK(cudaGetLastError());
    // cudaDeviceSynchronize waits for the kernels to finish and surfaces any
    // asynchronous execution errors.
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(out, d_out, grid_bev_size, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f(ms)\n", elapsed_time);
    CHECK(cudaEventDestroy(start));  // BUG FIX: events were never destroyed
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(d_points));
    CHECK(cudaFree(d_out));          // BUG FIX: d_out was leaked
    CHECK(cudaFree(d_hash_list));
    CHECK(cudaFree(d_grid_first_entry));
}
【测试】
recv points:58900
time cost: 0.470(ms)
recv points:59541
time cost: 0.502(ms)
recv points:58875
time cost: 0.822(ms)
recv points:58856
time cost: 0.786(ms)
recv points:59224
time cost: 0.819(ms)
recv points:59459
time cost: 0.952(ms)
recv points:58728
time cost: 0.504(ms)
recv points:59197
time cost: 0.503(ms)
recv points:59404
time cost: 0.844(ms)
当然,这个测试有其不合理的地方。CUDA这里只计算了数据拷贝与核函数执行部分的时间(两个event之间包含了上传和回传),而忽略了显存分配以及缓冲区初始化的时间。