Python3 TensorRT inference demo

1. Demo code: inferring 68 facial landmarks with TensorRT

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2

def get_engine(engine_path):
    """Deserialize a TensorRT engine from a serialized plan file.

    Args:
        engine_path: Path to the serialized engine (.engine / .trt) file.

    Returns:
        The deserialized ICudaEngine (None if deserialization fails).
    """
    print("Reading engine from file {}".format(engine_path))
    # Uses the module-level TRT_LOGGER defined below (bound before first call).
    with trt.Runtime(TRT_LOGGER) as runtime, open(engine_path, "rb") as plan_file:
        serialized_plan = plan_file.read()
        return runtime.deserialize_cuda_engine(serialized_plan)

TRT_LOGGER = trt.Logger()
# engine = get_engine("yolov4_1.trt")


# Deserialize the PFLD face-landmark engine ONCE and reuse it below.
# (The original code called get_engine("pfld.engine") a second time before
# creating the execution context, deserializing the same plan twice and
# wasting time and GPU memory.)
engine = get_engine("pfld.engine")
print(engine)

# Dump every binding (input/output tensor) so the expected shapes and dtypes
# are visible when running the script.
# NOTE(review): get_binding_shape / binding_is_input / get_binding_dtype are
# the legacy binding API (deprecated from TensorRT 8.5); they still work on
# the TRT 7/8 versions this demo targets.
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding))
    dims = engine.get_binding_shape(binding)
    print(size)
    print(dims)
    print(binding)
    print(engine.binding_is_input(binding))
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    print("dtype = ", dtype)

context = engine.create_execution_context()

def get_landmarks(img):
    """Run PFLD landmark inference on a single BGR image.

    Args:
        img: BGR image array (H, W, 3), e.g. as returned by cv2.imread.

    Returns:
        Flat float32 host array of landmark coordinates
        (x0, y0, x1, y1, ...); the caller scales them — presumably they are
        normalized to [0, 1] relative to the 112x112 network input
        (TODO confirm against the PFLD model).
    """
    # Preprocess: resize to the network input size, BGR -> RGB,
    # HWC uint8 -> NCHW float32 scaled into [0, 1].
    # (Removed the per-call cv2.imwrite("tmp.jpg", ...) debug dump the
    # original left in — an unwanted filesystem side effect on every call.)
    resized = cv2.resize(img, (112, 112), interpolation=cv2.INTER_LINEAR)
    img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
    img_in = np.expand_dims(img_in, axis=0)
    img_in /= 255.0
    print("Shape of the network input: ", img_in.shape)

    # Page-locked host buffers sized from the context's binding shapes
    # (binding 0 = input, binding 1 = output).
    h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)

    # Device buffers and a stream for the async copy / infer / copy sequence.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()

    np.copyto(h_input, img_in.ravel())

    try:
        # Host -> device, inference, device -> host, then wait for completion.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()
    finally:
        # Free device memory deterministically — the original relied on GC
        # to release these allocations, leaking GPU memory between calls.
        d_input.free()
        d_output.free()

    return h_output


# --- Demo: run inference on a sample image and draw the predicted landmarks. ---
img1 = cv2.imread("./s_28.jpg")
if img1 is None:
    # cv2.imread returns None (no exception) on a missing/unreadable file;
    # the original crashed later with a confusing AttributeError.
    raise FileNotFoundError("Could not read image ./s_28.jpg")
print(img1.shape)

output = get_landmarks(img1).reshape(-1, 2)
print(output)

# The landmarks appear to be normalized to [0, 1] (the original multiplied by
# the 112x112 network input size). Scaling by the ACTUAL image dimensions
# draws them correctly on images of any size; the fixed 112 factor only
# lined up when the source image itself was 112x112.
height, width = img1.shape[:2]
for x_norm, y_norm in output:
    x = int(x_norm * width)
    y = int(y_norm * height)
    cv2.circle(img1, (x, y), 2, (0, 255, 0), -1)

cv2.imwrite("out.jpg", img1)
Previous post: A detailed guide to custom TensorRT 7.x plugins


Next post: The TensorRT backend for ONNX