linux20.04 + CUDA11.3 + cudnn8.2 + TensorRT8.0.1.6
nvidia-smi显示的CUDA版本和nvcc -V得到的CUDA版本会不一致,nvidia-smi显示的是支持的最高的
1.安装CUDA,从官网下载,可下在run的和deb的,按照提示的命令安装,安装完成后/usr/local/cuda*路径下
2.安装cudnn,从官网下载,下载tar版本的,解压后有include和lib64文件夹,执行如下命令安装
sudo cp cuda/include/cudnn.h /usr/local/cuda-11.3/include/
sudo cp cuda/lib64/libcudnn* /usr/local/cuda-11.3/lib64/
sudo chmod a+r /usr/local/cuda-11.3/include/cudnn.h
sudo chmod a+r /usr/local/cuda-11.3/lib64/libcudnn*
3.安装TensorRT,从官网下载,下载tar版本,解压后有lib64文件夹,路径加入环境变量,进入python文件夹安装whl
注意事项:
1.原始CUDA但版本不对,一定要匹配,要不麻烦死,卸载nvidia,出错可以去“软件和更新”那其他软件那去掉一下相关的对沟,要不卸载不了,显示包不对应
sudo apt-get remove --auto-remove nvidia-cuda-toolkit
2.查看CUDA版本
cat /usr/local/cuda-11.3/version.json
3.添加环境变量
vim ~/.bashrc
加入
export PATH="/usr/local/cuda-11.3/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/nercar/TensorRT-7.0.0.11/lib
export CUDA_INSTALL_DIR=/usr/local/cuda-11.3/lib64
export CUDNN_INSTALL_DIR=/usr/local/cuda-11.3/lib64
执行
source ~/.bashrc
nvcc -V
4.pycharm上出错没有so文件时,通过在pycharm配置环境变量解决,run里边配置环境
PYTHONUNBUFFERED=1;CUDNN_HOME=/usr/local/cuda-11.3/lib64;LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64\;/home/nercar/TensorRT-8.0.1.6/lib
5.tensorrt8以上会出现有些东西未声明,可以参照自带的历程
实现bisenet的推理
pth转onnx
import argparse
import torch
from lib.models import model_factory
from configs import set_cfg_from_file
import sys
sys.path.insert(0, '.')
torch.set_grad_enabled(False)
parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str, default='../configs/bisenetv2_city.py', )
parse.add_argument('--weight-path', dest='weight_pth', type=str, default='model/bisenetv2.pth')
parse.add_argument('--outpath', dest='out_pth', type=str, default='model/bisenetv2.onnx')
parse.add_argument('--aux-mode', dest='aux_mode', type=str, default='pred')
args = parse.parse_args()
cfg = set_cfg_from_file(args.config)
if cfg.use_sync_bn: cfg.use_sync_bn = False
net = model_factory[cfg.model_type](cfg.n_cats, aux_mode=args.aux_mode)
net.load_state_dict(torch.load(args.weight_pth, map_location='cpu'), strict=False)
net.eval()
dummy_input = torch.randn(1, 3, 1024, 2048)
input_names = ['input_image']
output_names = ['preds', ]
torch.onnx.export(net, dummy_input, args.out_pth,
input_names=input_names,
output_names=output_names,
verbose=False,
opset_version=11)
onnx转trt
from __future__ import print_function
import os
import sys
import cv2
import time
import common
import numpy as np
import tensorrt as trt
sys.path.insert(1, os.path.join(sys.path[0], ".."))
TRT_LOGGER = trt.Logger()
def get_engine(onnx_file_path, engine_file_path=""):
def build_engine():
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(common.EXPLICIT_BATCH) as network, \
builder.create_builder_config() as config, \
trt.OnnxParser(network, TRT_LOGGER) as parser, \
trt.Runtime(TRT_LOGGER) as runtime:
config.max_workspace_size = 1 << 30 # 256MiB
builder.max_batch_size = 1
if not os.path.exists(onnx_file_path):
print('ONNX file {} not found.'.format(onnx_file_path))
exit(0)
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
print('ERROR: Failed to parse the ONNX file.')
for error in range(parser.num_errors):
print(parser.get_error(error))
return None
network.get_input(0).shape = [1, 3, 512, 1024]
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
plan = builder.build_serialized_network(network, config)
engine = runtime.deserialize_cuda_engine(plan)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
f.write(plan)
return engine
if os.path.exists(engine_file_path):
print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
else:
return build_engine()
mean = (0.3257, 0.3690, 0.3223)
std = (0.2112, 0.2148, 0.2115)
def main():
onnx_file_path = 'model/bisenetv2.onnx'
engine_file_path = "model/bisenetv2.trt"
input_image_path = 'data/1.png'
image = cv2.imread(input_image_path, 1)
image = cv2.resize(image, (1024, 512))
image = image/255.
# image = ((image / 255.0) - mean) / std
image = np.transpose(image, [2, 0, 1])
image = np.expand_dims(image, axis=0)
image = np.array(image, dtype=np.float32, order='C')
with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context:
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
print('Running inference on image {}...'.format(input_image_path))
inputs[0].host = image
start = time.time()
trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
end = time.time()
print("use time:", end-start)
# stream.synchronize()
dst = trt_outputs[0].reshape((1024, 2048))
cv2.namedWindow("dst", 0)
cv2.imshow("dst", np.array(dst, np.uint8) * 10)
cv2.waitKey(0)
if __name__ == '__main__':
main()
common
import argparse
import os
import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def GiB(val):
return val * 1 << 30
def add_help(description):
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
# Standard command-line arguments for all samples.
kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-d", "--datadir",
help="Location of the TensorRT sample data directory, and any additional data directories.",
action="append", default=[kDEFAULT_DATA_ROOT])
args, _ = parser.parse_known_args()
def get_data_path(data_dir):
# If the subfolder exists, append it to the path, otherwise use the provided path as-is.
data_path = os.path.join(data_dir, subfolder)
if not os.path.exists(data_path):
if data_dir != kDEFAULT_DATA_ROOT:
print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
data_path = data_dir
# Make sure data directory exists.
if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
data_path))
return data_path
data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
return data_paths, locate_files(data_paths, find_files, err_msg)
def locate_files(data_paths, filenames, err_msg=""):
found_files = [None] * len(filenames)
for data_path in data_paths:
# Find all requested files.
for index, (found, filename) in enumerate(zip(found_files, filenames)):
if not found:
file_path = os.path.abspath(os.path.join(data_path, filename))
if os.path.exists(file_path):
found_files[index] = file_path
# Check that all files were found
for f, filename in zip(found_files, filenames):
if not f or not os.path.exists(f):
raise FileNotFoundError(
"Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg))
return found_files
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
stream.synchronize()
return [out.host for out in outputs]
def do_inference_v2(context, bindings, inputs, outputs, stream):
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# stream.synchronize()
return [out.host for out in outputs]