Ubuntu20.04配置TensorRT

2021-08-03 10:16:16

Ubuntu20.04 + CUDA11.3 + cudnn8.2 + TensorRT8.0.1.6

nvidia-smi显示的CUDA版本和nvcc -V得到的CUDA版本可能不一致：nvidia-smi显示的是当前驱动支持的最高CUDA版本，nvcc -V显示的才是实际安装的CUDA Toolkit版本

1.安装CUDA，从官网下载，可下载run版本或deb版本，按照提示的命令安装，安装完成后位于/usr/local/cuda*路径下
2.安装cudnn，从官网下载，下载tar版本的，解压后有include和lib64文件夹，执行如下命令安装

# cuDNN 8 ships several cudnn*.h headers (not only cudnn.h), so copy all of them;
# -P keeps the libcudnn symlinks as symlinks instead of duplicating the libraries.
sudo cp cuda/include/cudnn*.h /usr/local/cuda-11.3/include/
sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda-11.3/lib64/
sudo chmod a+r /usr/local/cuda-11.3/include/cudnn*.h
sudo chmod a+r /usr/local/cuda-11.3/lib64/libcudnn*

3.安装TensorRT，从官网下载，下载tar版本，解压后有lib文件夹，将其路径加入环境变量LD_LIBRARY_PATH，然后进入python文件夹用pip安装与Python版本对应的whl

注意事项:
1.系统原有的CUDA版本不对时要先卸载：CUDA、cudnn、TensorRT的版本一定要互相匹配，要不麻烦死。卸载nvidia相关的包出错时，可以去“软件和更新”的“其他软件”那里去掉相关条目的对勾，要不卸载不了，会显示包不对应

sudo apt-get remove --auto-remove nvidia-cuda-toolkit

2.查看CUDA版本

cat /usr/local/cuda-11.3/version.json

3.添加环境变量

vim ~/.bashrc

加入

export PATH="/usr/local/cuda-11.3/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH"
# TensorRT libraries: the directory must match the installed TensorRT version (8.0.1.6 in this guide).
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/nercar/TensorRT-8.0.1.6/lib"
export CUDA_INSTALL_DIR=/usr/local/cuda-11.3/lib64
export CUDNN_INSTALL_DIR=/usr/local/cuda-11.3/lib64

执行

source ~/.bashrc
nvcc -V

4.pycharm上出错没有so文件时，通过在pycharm配置环境变量解决，run里边配置环境

PYTHONUNBUFFERED=1;CUDNN_HOME=/usr/local/cuda-11.3/lib64;LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64\;/home/nercar/TensorRT-8.0.1.6/lib

5.TensorRT 8以上版本由于接口变动，会出现有些东西未声明的错误，可以参照安装包自带的例程（samples）修改

实现bisenet的推理

pth转onnx

import argparse
import sys

# The current directory must be on sys.path BEFORE the project-local imports
# below are executed; inserting it afterwards has no effect on them.
sys.path.insert(0, '.')

import torch

from lib.models import model_factory
from configs import set_cfg_from_file

# Export only needs forward passes, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Command-line options: model config, input checkpoint, output ONNX file and
# the aux_mode value forwarded to the model constructor.
parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str, default='../configs/bisenetv2_city.py', )
parse.add_argument('--weight-path', dest='weight_pth', type=str, default='model/bisenetv2.pth')
parse.add_argument('--outpath', dest='out_pth', type=str, default='model/bisenetv2.onnx')
parse.add_argument('--aux-mode', dest='aux_mode', type=str, default='pred')
args = parse.parse_args()

cfg = set_cfg_from_file(args.config)
# Force synchronized batch-norm off for the export run.
if cfg.use_sync_bn:
    cfg.use_sync_bn = False

# Build the network selected by the config and load the checkpoint on the CPU.
# strict=False lets load_state_dict ignore missing/unexpected keys.
net = model_factory[cfg.model_type](cfg.n_cats, aux_mode=args.aux_mode)
net.load_state_dict(torch.load(args.weight_pth, map_location='cpu'), strict=False)
net.eval()

# Trace the model with a random 1x3x1024x2048 input and write the ONNX file.
dummy_input = torch.randn(1, 3, 1024, 2048)
input_names = ['input_image']
output_names = ['preds', ]
torch.onnx.export(net, dummy_input, args.out_pth,
                  input_names=input_names,
                  output_names=output_names,
                  verbose=False,
                  opset_version=11)

onnx转trt

from __future__ import print_function
import os
import sys
import cv2
import time
import common
import numpy as np
import tensorrt as trt
sys.path.insert(1, os.path.join(sys.path[0], ".."))

# Module-level TensorRT logger shared by the builder, ONNX parser and runtime
# objects created in get_engine().
TRT_LOGGER = trt.Logger()


def get_engine(onnx_file_path, engine_file_path=""):
    """Return a TensorRT engine, loading a cached plan or building one from ONNX.

    If ``engine_file_path`` names an existing file, its contents are
    deserialized and returned. Otherwise the ONNX model at ``onnx_file_path``
    is parsed, an engine is built and, when ``engine_file_path`` is non-empty,
    the serialized plan is written to that path for later runs.

    Args:
        onnx_file_path: Path of the ONNX model to build from.
        engine_file_path: Path of the serialized engine cache. The default
            empty string disables both loading and saving of the cache.

    Returns:
        The deserialized engine, or ``None`` if ONNX parsing or the engine
        build fails.

    Raises:
        SystemExit: With status 1 if an engine must be built but the ONNX
            file does not exist.
    """

    def build_engine():
        """Parse the ONNX file, build the engine and optionally cache the plan."""
        if not os.path.exists(onnx_file_path):
            print('ONNX file {} not found.'.format(onnx_file_path))
            # A missing model is an error, so exit with a non-zero status.
            sys.exit(1)

        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(common.EXPLICIT_BATCH) as network, \
                builder.create_builder_config() as config, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                trt.Runtime(TRT_LOGGER) as runtime:
            config.max_workspace_size = 1 << 30  # 1 GiB
            builder.max_batch_size = 1

            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    print('ERROR: Failed to parse the ONNX file.')
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None

            # Override the input shape of the parsed network with a fixed
            # 1x3x512x1024 tensor.
            network.get_input(0).shape = [1, 3, 512, 1024]
            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))

            plan = builder.build_serialized_network(network, config)
            if plan is None:
                # Nothing to deserialize or cache when the build fails.
                print('ERROR: Failed to build the TensorRT engine.')
                return None
            engine = runtime.deserialize_cuda_engine(plan)
            print("Completed creating Engine")

            # Only cache the plan when a target path was actually supplied;
            # open("") would raise with the default argument.
            if engine_file_path:
                with open(engine_file_path, "wb") as f:
                    f.write(plan)
            return engine

    if os.path.exists(engine_file_path):
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine()


# Per-channel mean and standard deviation for input normalization.
# NOTE(review): currently only referenced by the commented-out normalization
# line in main(); the channel order these values assume is not evident here —
# confirm before enabling.
mean = (0.3257, 0.3690, 0.3223)
std = (0.2112, 0.2148, 0.2115)


def main():
    """Run the BiSeNetV2 TensorRT engine on one image and display the output.

    Loads ``data/1.png``, resizes it to 1024x512, scales it to [0, 1],
    converts it to a contiguous float32 NCHW array, runs inference and shows
    the first output in an OpenCV window.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        RuntimeError: If no engine could be loaded or built.
    """
    onnx_file_path = 'model/bisenetv2.onnx'
    engine_file_path = "model/bisenetv2.trt"
    input_image_path = 'data/1.png'

    image = cv2.imread(input_image_path, 1)
    if image is None:
        # cv2.imread returns None instead of raising when the file is unreadable.
        raise FileNotFoundError('Could not read image {}'.format(input_image_path))
    image = cv2.resize(image, (1024, 512))
    image = image/255.
    # Mean/std normalization is currently disabled; only 0-1 scaling is applied.
    # image = ((image / 255.0) - mean) / std
    image = np.transpose(image, [2, 0, 1])  # HWC -> CHW
    image = np.expand_dims(image, axis=0)   # add the batch dimension
    image = np.array(image, dtype=np.float32, order='C')

    engine = get_engine(onnx_file_path, engine_file_path)
    if engine is None:
        # get_engine returns None on parse/build failure; fail with a clear message.
        raise RuntimeError('Failed to load or build engine from {}'.format(onnx_file_path))

    with engine, engine.create_execution_context() as context:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        print('Running inference on image {}...'.format(input_image_path))
        inputs[0].host = image
        start = time.time()
        trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
        # Wait for the queued copies and kernels so the timing covers the real
        # work and the host output buffers are complete before they are read.
        stream.synchronize()
        end = time.time()
        print("use time:", end-start)
    # NOTE(review): hard-coded output size; assumes the engine's first output
    # holds 1024*2048 elements — confirm against the exported model.
    dst = trt_outputs[0].reshape((1024, 2048))
    cv2.namedWindow("dst", 0)
    # Multiply by 10 so the small output values become visible grey levels.
    cv2.imshow("dst", np.array(dst, np.uint8) * 10)
    cv2.waitKey(0)


if __name__ == '__main__':
    main()

common

import argparse
import os
import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
# Bit mask with the EXPLICIT_BATCH network-creation flag set; callers pass it
# to builder.create_network().
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def GiB(val):
    """Convert a size in GiB to bytes.

    Args:
        val: Number of gibibytes; may be an int or a float (e.g. ``0.5``).

    Returns:
        ``val`` multiplied by 2**30. The result is an int for int input.
    """
    # Parenthesised on purpose: ``val * 1 << 30`` parses as ``(val * 1) << 30``,
    # which raises TypeError for fractional values.
    return val * (1 << 30)


def add_help(description):
    """Parse the command line with a parser that only carries ``description``.

    The parser defines no options of its own, so unknown arguments are
    ignored and the parse result is discarded. Nothing is returned.
    """
    argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    ).parse_known_args()


def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
    """Resolve sample data directories and locate the requested files in them.

    Data directories come from the ``-d/--datadir`` command-line option, which
    appends to a list that starts with the default TensorRT data root. For
    each directory, ``subfolder`` is appended when that path exists; otherwise
    the directory itself is used.

    Args:
        description: Text for the argument parser's help output.
        subfolder: Sub-directory to look for inside each data directory.
        find_files: File names to locate inside the resolved directories.
        err_msg: Extra text forwarded to ``locate_files`` for its error message.

    Returns:
        A tuple ``(data_paths, found_files)`` where ``found_files`` is the
        result of ``locate_files(data_paths, find_files, err_msg)``.
    """
    default_root = os.path.join(os.sep, "usr", "src", "tensorrt", "data")

    cli = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        "-d", "--datadir",
        action="append",
        default=[default_root],
        help="Location of the TensorRT sample data directory, and any additional data directories.",
    )
    known, _unknown = cli.parse_known_args()

    def resolve(root):
        """Return ``root/subfolder`` if it exists, else fall back to ``root``."""
        user_supplied = root != default_root
        candidate = os.path.join(root, subfolder)
        if not os.path.exists(candidate):
            # Warnings are only printed for directories the user asked for.
            if user_supplied:
                print("WARNING: " + candidate + " does not exist. Trying " + root + " instead.")
            candidate = root
        if user_supplied and not os.path.exists(candidate):
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
                candidate))
        return candidate

    data_paths = list(map(resolve, known.datadir))
    located = locate_files(data_paths, find_files, err_msg)
    return data_paths, located


def locate_files(data_paths, filenames, err_msg=""):
    """Find each file name in the first data path that contains it.

    Args:
        data_paths: Directories to search, in priority order.
        filenames: File names (relative to a data path) to look up.
        err_msg: Extra text appended to the error raised for a missing file.

    Returns:
        A list of absolute paths, one per entry of ``filenames``, in order.

    Raises:
        FileNotFoundError: If any file is not found in any data path.
    """
    resolved = [None] * len(filenames)

    for root in data_paths:
        for slot, wanted in enumerate(filenames):
            # Earlier data paths win: skip names that are already resolved.
            if resolved[slot]:
                continue
            candidate = os.path.abspath(os.path.join(root, wanted))
            if os.path.exists(candidate):
                resolved[slot] = candidate

    # Every requested file must have been resolved to an existing path.
    for hit, wanted in zip(resolved, filenames):
        if not (hit and os.path.exists(hit)):
            raise FileNotFoundError(
                "Could not find {:}. Searched in data paths: {:}\n{:}".format(wanted, data_paths, err_msg))
    return resolved


# Small record type pairing a host buffer with its device buffer; a little
# nicer to work with than a bare 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Store the host-side and device-side buffers of one binding."""
        self.device = device_mem
        self.host = host_mem

    def __str__(self):
        """Return a multi-line text showing the host and device members."""
        parts = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(parts)

    def __repr__(self):
        """Use the same text as ``__str__``."""
        return str(self)


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    For each binding a page-locked host array and a device allocation of the
    same byte size are created. The element count is the volume of the
    binding shape multiplied by ``engine.max_batch_size``.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem`` pairs, the list of device addresses as ints in
        binding order, and a newly created CUDA stream.
    """
    stream = cuda.Stream()
    inputs, outputs, bindings = [], [], []

    for name in engine:
        element_count = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # Host side: page-locked array; device side: same number of bytes.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        dev_buf = cuda.mem_alloc(host_buf.nbytes)

        # The bindings list holds the device addresses as plain ints.
        bindings.append(int(dev_buf))

        destination = inputs if engine.binding_is_input(name) else outputs
        destination.append(HostDeviceMem(host_buf, dev_buf))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference through ``context.execute_async`` and return host outputs.

    Queues the host-to-device copies of all inputs, the execution with the
    given ``batch_size``, and the device-to-host copies of all outputs on
    ``stream``, then blocks until the stream has finished.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Upload every input buffer.
    for src in inputs:
        cuda.memcpy_htod_async(src.device, src.host, stream)

    # Launch the inference on the same stream.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Download every output buffer.
    for dst in outputs:
        cuda.memcpy_dtoh_async(dst.host, dst.device, stream)

    # Wait for all queued work before handing the host buffers back.
    stream.synchronize()
    return [dst.host for dst in outputs]


def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run one inference through ``context.execute_async_v2`` and return host outputs.

    Queues the host-to-device copies of all inputs, the execution, and the
    device-to-host copies of all outputs on ``stream``, then blocks until the
    stream has finished so the returned buffers are complete.

    Args:
        context: Execution context providing ``execute_async_v2``.
        bindings: Device addresses passed to ``execute_async_v2``.
        inputs: Objects with ``host`` and ``device`` members to upload.
        outputs: Objects with ``host`` and ``device`` members to download.
        stream: CUDA stream on which all work is queued.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Upload every input buffer.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Launch the inference on the same stream.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Download every output buffer.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # The copies above are asynchronous: without this wait the caller could
    # read the host buffers before the results have arrived.
    stream.synchronize()
    return [out.host for out in outputs]

码农公寓

linux20.04 + CUDA11.3 + cudnn8.2 + TensorRT8.0.1.6

相关文章