Generating a static engine model -- fixed batch
I. Saving a PyTorch model
1. Saving only the model parameters (state_dict)
save_filename = 'net_%s.pth' % epoch_label
save_path = os.path.join('./model', name, save_filename)
torch.save(network.cpu().state_dict(), save_path)
Loading the model parameters:
save_path = os.path.join('./model', name, 'net_%s.pth' % opt.which_epoch)
network.load_state_dict(torch.load(save_path))
2. Saving the entire model
Save:
torch.save(model_object, 'model.pth')
Load:
model = torch.load('model.pth')
model.eval()
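A small, self-contained round trip illustrates the difference between the two approaches; this is only a sketch, and the tiny stand-in model and file names are made up for illustration:

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 2))  # stand-in for the real network

# Approach 1: save/load only the parameters (recommended)
torch.save(net.state_dict(), 'net_demo.pth')
net2 = nn.Sequential(nn.Linear(4, 2))            # the structure must be rebuilt before loading
net2.load_state_dict(torch.load('net_demo.pth'))

# Approach 2: save/load the whole module; this pickles the class, so the
# class definition must be importable at load time
torch.save(net, 'model_demo.pth')
net3 = torch.load('model_demo.pth')
net3.eval()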
II. PyTorch -> ONNX
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from model import ft_net  # ft_net is the backbone definition from the training project's model.py

def load_network(network, name):
    save_path = os.path.join('../model', name, 'net_last.pth')
    network.load_state_dict(torch.load(save_path))
    return network
print(torch.__version__)
input_name = ['input']
output_name = ['output']
name = 'ft_ResNet50_veri'
print("===> Loading model")
model_structure = ft_net(576, stride=2)
model = load_network(model_structure, name)
model.classifier.classifier = nn.Sequential()  # strip the final classification layer so the network outputs features
print(model)
print('===> Load last checkpoint data')
model = model.eval()  # switch to inference mode before export
input = Variable(torch.rand(8, 3, 256, 128))  # dummy input with the fixed export shape 8x3x256x128
torch.onnx.export(model, input, 'resnet_veri_8_3_256_128.onnx',
                  input_names=input_name, output_names=output_name, verbose=True)
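After exporting, it is worth sanity-checking the ONNX file before building an engine. A minimal sketch, assuming the onnx and onnxruntime packages are available and reusing the file name and model from the export above:

import numpy as np
import onnx
import onnxruntime as ort

# Structural check of the exported graph
onnx_model = onnx.load('resnet_veri_8_3_256_128.onnx')
onnx.checker.check_model(onnx_model)

# Compare ONNX Runtime output with the PyTorch model on the same dummy input
sess = ort.InferenceSession('resnet_veri_8_3_256_128.onnx')
dummy = np.random.rand(8, 3, 256, 128).astype(np.float32)
onnx_out = sess.run(['output'], {'input': dummy})[0]
torch_out = model(torch.from_numpy(dummy)).detach().numpy()
print('max abs diff:', np.abs(onnx_out - torch_out).max())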
III. ONNX -> engine
trtexec --onnx=resnet_veri_8_3_256_128.onnx --minShapes=input:1x3x256x128 --optShapes=input:8x3x256x128 --maxShapes=input:32x3x256x128 --workspace=2048 --saveEngine=resnet_veri_8_3_256_128.engine --fp16
Note:
- The input and output shapes of a static engine are fixed: with the model built for 8×3×256×128, feeding a batch smaller than 8 produces a shape-mismatch error. A padding workaround is sketched below.
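One possible workaround (an assumption on my part, not part of the original workflow) is to pad the batch up to the engine's fixed batch size and drop the extra outputs afterwards; a minimal sketch for the 8×3×256×128 engine above:

import numpy as np

FIXED_BATCH = 8  # batch size the static engine was built with

def pad_to_fixed_batch(batch):
    # batch: (n, 3, 256, 128) with n <= FIXED_BATCH
    n = batch.shape[0]
    if n == FIXED_BATCH:
        return batch, n
    padded = np.zeros((FIXED_BATCH,) + batch.shape[1:], dtype=batch.dtype)
    padded[:n] = batch
    return padded, n  # run inference on padded, then keep only the first n outputs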
Generating a dynamic engine model -- dynamic batch
- When converting the .pth to ONNX, add a dynamic batch dimension via dynamic_axes:
torch.onnx.export(model, input, 'resnet_veri_1_3_256_128_eval_dynamic_test.onnx',
                  input_names=input_name, output_names=output_name, verbose=True,
                  dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}})
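To confirm that the batch dimension really was exported as dynamic, the graph inputs can be inspected; a small sketch, assuming the onnx package and the file name used above:

import onnx

m = onnx.load('resnet_veri_1_3_256_128_eval_dynamic_test.onnx')
for inp in m.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # expected: input ['batch_size', 3, 256, 128]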
- Use the trtexec tool shipped with TensorRT to build and save the CUDA engine directly from an ONNX model. There are two cases:
1. From a fixed-shape ONNX to a cudaEngine.
2. From a dynamic-shape ONNX to a cudaEngine, which requires specifying an optimization profile (min/opt/max shapes):
trtexec --onnx=resnet_veri_1_3_256_128_eval_dynamic_test.onnx --explicitBatch --minShapes=input:1x3x256x128 --optShapes=input:8x3x256x128 --maxShapes=input:32x3x256x128 --shapes=input:1x3x256x128 --workspace=2048 --saveEngine=resnet_veri_1_3_256_128_eval_dynamic_test.engine --fp16
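The same engine can also be built programmatically with the TensorRT Python API instead of trtexec. The sketch below mirrors the command above and uses the same TensorRT 7.x-style API as the inference code later in this post; the function name build_dynamic_engine is just illustrative:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_dynamic_engine(onnx_path, engine_path):
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(explicit_batch) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_path, 'rb') as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        config = builder.create_builder_config()
        config.max_workspace_size = 2 << 30          # 2 GB, matches --workspace=2048
        config.set_flag(trt.BuilderFlag.FP16)        # matches --fp16
        # Optimization profile: min/opt/max shapes for the dynamic "input" binding
        profile = builder.create_optimization_profile()
        profile.set_shape('input', (1, 3, 256, 128), (8, 3, 256, 128), (32, 3, 256, 128))
        config.add_optimization_profile(profile)
        engine = builder.build_engine(network, config)
        with open(engine_path, 'wb') as f:
            f.write(engine.serialize())
        return engine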
- Inference code:
import argparse
import time
from typing import Tuple, List

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
BatchSize = 32

# Check whether a binding shape is fixed or dynamic
def is_fixed(shape: Tuple[int]):
    return not is_dynamic(shape)

def is_dynamic(shape: Tuple[int]):
    return any(dim is None or dim < 0 for dim in shape)

def load_engine(filename: str):
    # Load the serialized engine file into memory and deserialize it
    with open(filename, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    # Calculate start/end binding indices for the current context's profile
    num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * num_bindings_per_profile
    end_binding = start_binding + num_bindings_per_profile
    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(engine.num_optimization_profiles))
    print("\tNumber of bindings per profile: {}".format(num_bindings_per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index, start_binding))
    print("\tLast binding for profile {}: {}".format(profile_index, end_binding - 1))
    # Separate input and output binding indices for convenience
    input_binding_idxs = []
    output_binding_idxs = []
    for binding_index in range(start_binding, end_binding):
        if engine.binding_is_input(binding_index):
            input_binding_idxs.append(binding_index)
        else:
            output_binding_idxs.append(binding_index)
    return input_binding_idxs, output_binding_idxs

# Set the input shapes, derive the output shapes from them, and allocate device memory for the outputs
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)
    assert context.all_binding_shapes_specified
    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Allocate buffers to hold output results after copying back to host
        buffer = np.empty(output_shape, dtype=np.float32)
        host_outputs.append(buffer)
        # Allocate output buffers on device
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))
    return host_outputs, device_outputs

def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
    seed: int = 42,
):
    # Input data for inference
    host_inputs = []
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)
    for binding_index in input_binding_idxs:
        # If the input shape is fixed, we'll just use it
        input_shape = context.get_binding_shape(binding_index)
        input_name = engine.get_binding_name(binding_index)
        print("\tInput [{}] shape: {}".format(input_name, input_shape))
        # If the input shape is dynamic, we'll arbitrarily select one of the
        # min/opt/max shapes from our optimization profile
        if is_dynamic(input_shape):
            profile_index = context.active_optimization_profile
            profile_shapes = engine.get_profile_shape(profile_index, binding_index)
            print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".format(input_name, *profile_shapes))
            # 0=min, 1=opt, 2=max, or choose any shape (min <= shape <= max)
            input_shape = (BatchSize, 3, 256, 128)  # profile_shapes[1]; must lie within the profile built above
            print("\tInput [{}] shape was dynamic, setting inference shape to {}".format(input_name, input_shape))
        host_inputs.append(np.random.random(input_shape).astype(np.float32))
    return host_inputs

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--engine", required=True, type=str, help="Path to TensorRT engine file.")
    parser.add_argument("-s", "--seed", type=int, default=42, help="Random seed for reproducibility.")
    args = parser.parse_args()

    # Load a serialized engine into memory (deserialize the CUDA engine)
    engine = load_engine(args.engine)
    print("Loaded engine: {}".format(args.engine))

    # Create an execution context; this can be re-used
    context = engine.create_execution_context()
    # Profile 0 (first profile) is used by default. A context can hold several profiles;
    # the chosen profile defines the allowed range of input shapes.
    context.active_optimization_profile = 0
    print("Active Optimization Profile: {}".format(context.active_optimization_profile))

    # These binding_idxs can change if either the context or the
    # active_optimization_profile are changed: map the profile's bindings to indices
    input_binding_idxs, output_binding_idxs = get_binding_idxs(
        engine, context.active_optimization_profile
    )
    # Names of the input bindings
    input_names = [engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs]

    # Generate random inputs based on the profile shapes
    host_inputs = get_random_inputs(engine, context, input_binding_idxs, seed=args.seed)

    # Allocate device memory for inputs. This can be easily re-used if the
    # input shapes don't change
    device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]
    # Copy host inputs to device; this needs to be done for each new input
    for h_input, d_input in zip(host_inputs, device_inputs):
        cuda.memcpy_htod(d_input, h_input)

    print("Input Metadata")
    print("\tNumber of Inputs: {}".format(len(input_binding_idxs)))
    print("\tInput Bindings for Profile {}: {}".format(context.active_optimization_profile, input_binding_idxs))
    print("\tInput names: {}".format(input_names))
    print("\tInput shapes: {}".format([inp.shape for inp in host_inputs]))

    # This needs to be called every time your input shapes change.
    # If your inputs are always the same shape (same batch size, etc.),
    # then you will only need to call this once: it fixes the network input/output sizes.
    host_outputs, device_outputs = setup_binding_shapes(
        engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
    )  # returns host buffers and device buffers for the outputs
    output_names = [engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs]

    print("Output Metadata")
    print("\tNumber of Outputs: {}".format(len(output_binding_idxs)))
    print("\tOutput names: {}".format(output_names))
    print("\tOutput shapes: {}".format([out.shape for out in host_outputs]))
    print("\tOutput Bindings for Profile {}: {}".format(context.active_optimization_profile, output_binding_idxs))

    # Bindings are a list of device pointers for inputs and outputs
    bindings = device_inputs + device_outputs  # list concatenation

    # Inference
    t1 = time.time()
    for i in range(1000):
        context.execute_v2(bindings)  # run 1000 times; note this differs from execute_async_v2
    t2 = time.time()
    print("Inference iterations: {}".format(t2 - t1))
    print("Inference iterations per sample: {}".format((t2 - t1) / BatchSize))

    # Copy outputs back to host to view results (device -> host)
    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)

    # View outputs
    # print("Inference Outputs:", host_outputs)

    # Cleanup (can also use context managers instead)
    del context
    del engine

if __name__ == "__main__":
    main()
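To run the script above, save it, for example, as trt_dynamic_infer.py (the file name is only an example) and point it at the engine built earlier:

python trt_dynamic_infer.py -e resnet_veri_1_3_256_128_eval_dynamic_test.engine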
Issues:
- 1. Error: pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory
- Cause: in
size = trt.volume(engine.get_binding_shape(binding)) * n
size comes out negative, so host_mem = cuda.pagelocked_empty(size, dtype) cannot allocate memory. For an engine built with dynamic shapes, the first dimension of the binding shape is -1, e.g. (-1, 3, 128, 128), which makes size negative.
- Fix: add the following check:
size = trt.volume(engine.get_binding_shape(binding)) * n
dims = engine.get_binding_shape(binding)
if dims[0] < 0:
    size *= -1
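Flipping the sign only papers over the -1 in the batch dimension. An alternative, sketched below under the assumption that only the batch dimension is dynamic, is to size host buffers for the profile's maximum batch (32 in the engine above) so they are large enough for any allowed input:

import pycuda.driver as cuda
import tensorrt as trt

def allocate_host_buffer(engine, binding_index, max_batch=32):
    # max_batch should match the kMAX batch size of the optimization profile
    dims = list(engine.get_binding_shape(binding_index))
    dims = [max_batch if d < 0 else d for d in dims]  # replace the dynamic dim with the worst case
    size = trt.volume(dims)
    dtype = trt.nptype(engine.get_binding_dtype(binding_index))
    return cuda.pagelocked_empty(size, dtype)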