The existing TensorRT-accelerated MTCNN on GitHub [PKUZHOU/MTCNN_FaceDetection_TensorRT] is not plugin-based; instead it takes the workaround route of emulating PReLU with Scale, ReLU, and Eltwise-SUM layers.
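For reference, that workaround presumably rests on the identity

PReLU(x) = max(0, x) + a · min(0, x) = ReLU(x) − a · ReLU(−x),

which lets a PReLU layer be rebuilt from plain ReLU, per-channel Scale, and Eltwise-SUM layers, at the cost of extra layers sitting between the convolutions.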
PKUZHOU's reasoning is that PReLU would break TensorRT's CBR (Convolution + Bias + ReLU) fusion, but in practice the version with a PReLU plugin actually runs faster, as the figure shows.
The left side is the workaround version, the right side the PReLU-plugin version; on a 1920x1080 image the plugin version saves more than 200 ms (forgive me, my laptop's GPU is a GTX 970M on the ancient Maxwell architecture, which does not even support FP16, let alone INT8).
The plugin implementation:
prelu_plugin.h
#ifndef PRELU_PLUGIN_H
#define PRELU_PLUGIN_H

#include "kernels.h"

#include <cstring>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h> // __half
#include <NvInfer.h>
#include <NvCaffeParser.h>

using namespace nvinfer1;
using namespace nvcaffeparser1;

/* PReLU layer */
class PreluPlugin : public IPlugin
{
public:
    PreluPlugin(const Weights* weights, int nbWeights);
    PreluPlugin(const void* buffer, size_t size);
    ~PreluPlugin();

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
    int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override;
    int getNbOutputs() const override { return 1; }
    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override;
    void serialize(void* buffer) override;
    size_t getSerializationSize() override;
    inline size_t getWorkspaceSize(int) const override { return 0; }
    int initialize() override;
    void terminate() override;

protected:
    int m_input_c;
    int m_input_h;
    int m_input_w;
    int m_input_count;
    bool m_channel_shared {false};
    Weights m_weights;
    void* m_device_kernel{nullptr};

private:
    void deserializeToDevice(const char*& hostBuffer, void*& deviceWeights, size_t size)
    {
        deviceWeights = copyToDevice(hostBuffer, size);
        hostBuffer += size;
    }

    // Copy data from a host buffer to the device (also allocates the device memory)
    void* copyToDevice(const void* data, size_t count)
    {
        void* deviceData;
        cudaMalloc(&deviceData, count);
        cudaMemcpy(deviceData, data, count, cudaMemcpyHostToDevice);
        return deviceData;
    }

    template<typename T>
    void read(const char*& buffer, T& val)
    {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }

    template<typename T>
    void write(char*& buffer, const T& val)
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    size_t type2size(nvinfer1::DataType type)
    {
        // return sizeof(float);
        return type == nvinfer1::DataType::kFLOAT ? sizeof(float) : sizeof(__half);
    }

    // Copy the values of a Weights struct into the host buffer
    void convertAndCopyToBuffer(char*& buffer, const Weights& weights)
    {
        memcpy(buffer, weights.values, weights.count * type2size(weights.type));
        buffer += weights.count * type2size(weights.type);
    }
};

#endif // PRELU_PLUGIN_H
prelu_plugin.cpp
#include "prelu_plugin.h"

#include <iostream>

using namespace nvinfer1;
using namespace nvcaffeparser1;
//using namespace plugin;

PreluPlugin::PreluPlugin(const Weights* weights, int nbWeights)
{
    assert(nbWeights == 1);
    m_weights = weights[0];
    assert(m_weights.type == DataType::kFLOAT || m_weights.type == DataType::kHALF);

    // Allocate host memory for the values
    m_weights.values = malloc(m_weights.count * type2size(m_weights.type));
    // weights[0].values -> m_weights.values
    memcpy(const_cast<void*>(m_weights.values), weights[0].values,
           m_weights.count * type2size(m_weights.type));
}

PreluPlugin::PreluPlugin(const void* buffer, size_t size)
{
    // Deserialization: read fields in the same order they were serialized,
    // paying attention to the different data types
    const char* d = reinterpret_cast<const char*>(buffer), *a = d;
    read<int>(d, m_input_c);
    read<int>(d, m_input_h);
    read<int>(d, m_input_w);
    read<int>(d, m_input_count);
    read<bool>(d, m_channel_shared);
    read<int64_t>(d, m_weights.count);
    read<DataType>(d, m_weights.type);

    // m_weights.values = nullptr;
    m_weights.values = malloc(m_weights.count * type2size(m_weights.type));
    //deserializeToDevice(d,m_device_kernel,m_weights.count);
    // d -> m_weights.values
    memcpy(const_cast<void*>(m_weights.values), d,
           m_weights.count * type2size(m_weights.type));
    d += m_weights.count * type2size(m_weights.type); // advance the pointer

    assert(d == a + size);
}

PreluPlugin::~PreluPlugin()
{
    // std::cout << "~PreluPlugin "<< std::endl;
    // if (m_weights.values)
    // {
    //     free(const_cast<void*>(m_weights.values));
    // }
}

// Only called when building/serializing the engine
Dims PreluPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    // std::cout << "0~getOutputDimensions " << std::endl;
    assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
    return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
}

// Only called when building/serializing the engine
void PreluPlugin::configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int)
{
    // std::cout << "1~configure " << std::endl;
    m_input_c = inputs[0].d[0];
    m_input_h = inputs[0].d[1];
    m_input_w = inputs[0].d[2];
    m_input_count = m_input_c * m_input_h * m_input_w;
}

size_t PreluPlugin::getSerializationSize()
{
    return 4 * sizeof(int) + sizeof(bool) + sizeof(m_weights.count)
         + sizeof(m_weights.type) + m_weights.count * type2size(m_weights.type);
}

void PreluPlugin::serialize(void* buffer)
{
    char* d = static_cast<char*>(buffer), *a = d;
    write(d, m_input_c);
    write(d, m_input_h);
    write(d, m_input_w);
    write(d, m_input_count);
    write(d, m_channel_shared);
    write(d, m_weights.count);
    write(d, m_weights.type);
    convertAndCopyToBuffer(d, m_weights);
    assert(d == a + getSerializationSize());
}

int PreluPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream)
{
    const float* bottom_data = reinterpret_cast<const float*>(inputs[0]);
    float* top_data = reinterpret_cast<float*>(outputs[0]);

    const int count = batchSize * m_input_count;
    const int dim = m_input_h * m_input_w;
    const int channels = m_input_c;
    const int div_factor = m_channel_shared ? channels : 1; // m_channel_shared defaults to false

    pReLUForward(count, channels, dim, bottom_data, top_data,
                 m_device_kernel, div_factor, stream);

    return 0;
}

int PreluPlugin::initialize()
{
    // std::cout << "2~initialize~0 "<< m_device_kernel << std::endl;
    cudaMalloc(&m_device_kernel, m_weights.count * type2size(m_weights.type));
    cudaMemcpy(m_device_kernel, m_weights.values,
               m_weights.count * type2size(m_weights.type), cudaMemcpyHostToDevice);
    return 0;
}

// Called when the engine is destroyed
void PreluPlugin::terminate()
{
    // std::cout << "~terminate "<< m_device_kernel << std::endl;
    if (m_weights.values)
    {
        free(const_cast<void*>(m_weights.values));
    }
    if (m_device_kernel)
    {
        cudaFree(m_device_kernel);
        m_device_kernel = nullptr;
    }
}
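The post does not show it, but with the legacy IPlugin interface the plugin also has to be handed to the Caffe parser at build time and to the runtime at deserialization time through a plugin factory. Below is a minimal sketch, not the original code: only the IPluginFactory interfaces are TensorRT's, while the class name, the member container, and the assumption that PReLU layers contain "PReLU" in their names are mine.

#include <cassert>
#include <cstring>
#include <memory>
#include <vector>

#include "prelu_plugin.h"

class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
    // The Caffe parser asks about every layer it cannot handle natively.
    bool isPlugin(const char* layerName) override
    {
        return strstr(layerName, "PReLU") != nullptr; // assumption: PReLU layers are named "*PReLU*"
    }

    // Build phase: create the plugin from the weights parsed out of the .caffemodel.
    nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override
    {
        assert(isPlugin(layerName));
        m_plugins.emplace_back(new PreluPlugin(weights, nbWeights));
        return m_plugins.back().get();
    }

    // Runtime phase: re-create the plugin from the serialized engine blob.
    nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
    {
        assert(isPlugin(layerName));
        m_plugins.emplace_back(new PreluPlugin(serialData, serialLength));
        return m_plugins.back().get();
    }

private:
    std::vector<std::unique_ptr<PreluPlugin>> m_plugins; // keep plugins alive as long as the factory lives
};

Such a factory would be passed to ICaffeParser::setPluginFactory() before parsing the prototxt/caffemodel, and to IRuntime::deserializeCudaEngine() when loading a serialized engine.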
kernel.cu
#include "kernels.h" // launch helpers (CUDA_KERNEL_LOOP, CAFFE_GET_BLOCKS, ...) are expected to come from here

// Element-wise PReLU: out = in if in > 0, otherwise in * slope[c]
__global__ void pReLU(const int n, const int channels, const int dim,
                      const float* in, float* out,
                      const float* slope_data, const int div_factor)
{
    CUDA_KERNEL_LOOP(index, n)
    {
        int c = (index / dim) % channels / div_factor;
        out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
    }
}

void pReLUForward(const int count, const int channels, const int dim,
                  const float* bottom_data, float* top_data,
                  void* mDeviceKernel, const int div_factor, cudaStream_t stream)
{
    pReLU<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
        count, channels, dim, bottom_data, top_data,
        static_cast<const float*>(mDeviceKernel), // slope_data
        div_factor);
    CUDA_POST_KERNEL_CHECK;
}
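The kernel relies on Caffe-style CUDA launch helpers that the post never shows. Assuming kernels.h defines them the usual way, it might look roughly like this (a sketch; the real file may differ):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>

// Grid-stride loop over n elements.
#define CUDA_KERNEL_LOOP(i, n)                                   \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
         i += blockDim.x * gridDim.x)

// Threads per block and the number of blocks needed to cover N elements.
constexpr int CAFFE_CUDA_NUM_THREADS = 512;
inline int CAFFE_GET_BLOCKS(const int N)
{
    return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}

// Abort if the preceding kernel launch reported an error.
#define CUDA_POST_KERNEL_CHECK                                                    \
    do {                                                                          \
        cudaError_t err = cudaPeekAtLastError();                                  \
        if (err != cudaSuccess) {                                                 \
            fprintf(stderr, "CUDA kernel failed: %s\n", cudaGetErrorString(err)); \
            abort();                                                              \
        }                                                                         \
    } while (0)

// Declaration used by the plugin's enqueue().
void pReLUForward(const int count, const int channels, const int dim,
                  const float* bottom_data, float* top_data,
                  void* mDeviceKernel, const int div_factor, cudaStream_t stream);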
Further profiling with Valgrind and gprof shows that:
optimizing the nms and image2Matrix functions would improve performance further;
and since the detections of the individual PNet scales are mutually independent, they can also be parallelized with multiple host threads, and overlapping several streams on the GPU (ideally one with Hyper-Q support) can push performance further still, as in the sketch below.
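A rough sketch of that multi-stream idea, where each PNet scale is driven by its own thread and stream. ScaledImage, pyramid, and runPnetOnScale are hypothetical names standing in for the actual image-pyramid type and per-scale detection routine:

#include <thread>
#include <vector>
#include <cuda_runtime_api.h>

void detectAllScales(const std::vector<ScaledImage>& pyramid)
{
    const int nScales = static_cast<int>(pyramid.size());
    std::vector<cudaStream_t> streams(nScales);
    std::vector<std::thread> workers;

    for (int i = 0; i < nScales; ++i)
    {
        cudaStreamCreate(&streams[i]);
        // Each scale is independent, so its copies and kernels can be issued
        // from a separate host thread on a separate stream and overlap on the
        // GPU (best on Hyper-Q capable hardware).
        workers.emplace_back([&, i] { runPnetOnScale(pyramid[i], streams[i]); });
    }

    for (auto& w : workers) w.join();
    for (auto& s : streams)
    {
        cudaStreamSynchronize(s);
        cudaStreamDestroy(s);
    }
}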
The timing screenshot above was measured without TensorRT, using plain hand-written CUDA: averaged over 20 iterations, each iteration takes only about 60 ms.
After further optimization of the pure-CUDA implementation, a single iteration takes only 44 ms (still with min_size = 30, thresh_p = 0.7, thresh_r = 0.7, thresh_o = 0.7, thresh_nms_p = 0.5, thresh_nms_r = 0.5, thresh_nms_o = 0.5).
Accelerating this with TensorRT should yield even better performance.
Finally, the detection results: