In TensorFlow versions after 1.8, tensorflow.contrib ships a TensorRT component. The point of this component is that you can load a frozen pb file and call TensorRT to compress the subgraphs it supports; the workflow is described in the NVIDIA post tensorrt-integration-speeds-tensorflow-inference. Below is my modified version of that sample code. On a P40 card FP32 = FP16, because the P40 does not support FP16. The remaining problem is that INT8 does not pass; the cause is still to be investigated.
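The core of the integration is a single call, trt.create_inference_graph, which takes a frozen GraphDef plus the output node names and returns a GraphDef in which the supported subgraphs have been replaced by TensorRT nodes. A minimal sketch (assuming a frozen graph_def has already been loaded from the pb, and using the ResNet-50 output node from the script below):

import tensorflow.contrib.tensorrt as trt
trt_graph = trt.create_inference_graph(
    graph_def,                                # frozen tf.GraphDef read from the pb file
    ["resnet_v1_50/predictions/Reshape_1"],   # output node names
    max_batch_size=1,
    max_workspace_size_bytes=1 << 30,         # GPU memory left for the TensorRT engines
    precision_mode='FP32')                    # 'FP32', 'FP16' or 'INT8'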
# -*- coding: utf-8 -*-
r""" TF-TensorRT integration sample script
1 - Specify the fraction of GPU memory allowed for TensorFlow. TensorRT can use the remaining memory.
2 - Let TensorRT analyze the TensorFlow graph, apply optimizations, and replace subgraphs with TensorRT nodes.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import json
import os.path as osp
import argparse, itertools, datetime
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.client import timeline
import tensorflow.contrib.tensorrt as trt
tf.logging.set_verbosity(tf.logging.INFO)
class TF2TensorRT(object):
'''Convert the model by calling the TensorRT integration bundled with
tensorflow.contrib; TensorRT automatically converts the subgraphs it supports
and a new pb file can be written out. For later deployment the result can still
be loaded through the normal TensorFlow API: the parts TensorRT converted run
as TensorRT engines, and everything else keeps running in TensorFlow.'''
def __init__(self, percent, batch_size, output_nodes):
'''Use the new per_process_gpu_memory_fraction parameter of the GPUOptions
function to specify the GPU memory fraction TensorRT can consume. This
parameter should be set the first time the TensorFlow-TensorRT process
starts. As an example, 0.67 would allocate 67% of GPU memory for TensorFlow,
making the remaining 33% available for TensorRT engines. '''
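# For example (illustration only): on a hypothetical 12 GB GPU, reserving
# ~4 GB for the TensorRT engines means percent = (12 - 4) / 12, i.e. about 0.67.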
self.batch_size = batch_size
self.output_nodes = output_nodes
self.gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=percent)
self.config = tf.ConfigProto(gpu_options=self.gpu_options)
def read_pb(self, pb_path, graph, sess):
'''read the model from pb file '''
self.pb_path = pb_path
with graph.as_default():
with gfile.FastGFile(pb_path, 'rb') as fr:
graph_def = tf.GraphDef()
graph_def.ParseFromString(fr.read())
return graph_def
def _write_pb(self, trt_graph, precision_mode):
'''write converted model into new pb file '''
dir_path, ext = osp.splitext(self.pb_path)
newpb_filename = f'{dir_path}{precision_mode}{ext}'
with gfile.FastGFile(newpb_filename, 'wb') as fw:
fw.write(trt_graph.SerializeToString())
return newpb_filename
def create_workspace(self):
graph = tf.Graph()
with graph.as_default():
sess = tf.Session(graph=graph,config=self.config)
return graph,sess
def close_workspace(self,*args,sess=None):
sess.close()
def get_FPxx(self,
graph,graph_def,
workspace_size=1<<30,
precision_mode='FP32',
dump=True):
'''You apply TensorRT optimizations to the frozen graph with the new
create_inference_graph function. TensorRT then takes a frozen TensorFlow
graph as input and returns an optimized graph with TensorRT nodes
You should use the per_process_gpu_memory_fraction and max_workspace_size_bytes
parameters together for best overall application performance. For example,
set the per_process_gpu_memory_fraction parameter to ( 12 – 4 ) / 12 = 0.67
and the max_workspace_size_bytes parameter to 4000000000 for a 12GB GPU
in order to allocate ~4GB for the TensorRT engines.
TensorRT automatically uses Tensor Cores in Volta GPUs for inference when using
half-precision arithmetic. The peak performance of Tensor Cores on the NVIDIA
Tesla V100 is about an order of magnitude (10x) faster than double precision (FP64)
and about 4 times faster than single precision (FP32). Just use FP16 as value for
the precision_mode parameter in the create_inference_graph function to enable
half precision
---
frozen_graph_def: frozen TensorFlow graph
output_node_name: list of strings with names of output nodes
e.g. ["resnet_v1_50/predictions/Reshape_1"]
max_batch_size: integer, size of input batch e.g. 16
max_workspace_size_bytes: integer, maximum GPU memory size available for TensorRT
precision_mode: string, allowed values FP32, FP16 or INT8
'''
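# Sizing illustration (assumed numbers, matching the docstring above): on a 12 GB
# card, per_process_gpu_memory_fraction = 0.67 pairs with
# max_workspace_size_bytes = 4 * (1 << 30), i.e. ~4 GB for the TensorRT engines.
# Passing precision_mode='FP16' here is all that is needed to enable half precision.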
with graph.as_default():
trt_graph = trt.create_inference_graph(graph_def, self.output_nodes,
max_batch_size=self.batch_size,
max_workspace_size_bytes=workspace_size,
precision_mode=precision_mode )
if dump:
newpb_path = self._write_pb(trt_graph, precision_mode)
else:
newpb_path=''
return trt_graph,newpb_path
def get_INT8(self,
graph,
calib_graph,
workspace_size=1<<30,
precision_mode='INT8'):
'''TensorRT provides capabilities to take models trained in single (FP32) and
half (FP16) precision and convert them for deployment with INT8 quantizations
while minimizing accuracy loss.
To convert models for deployment with INT8, you need to calibrate the trained
FP32 model before applying TensorRT’s optimizations described in the earlier
sections. The remaining workflow remains unchanged
1 - First use the create_inference_graph function with the precision_mode parameter
set to INT8 to calibrate the model. The output of this function is a frozen
TensorFlow graph ready for calibration.
2 - Next, execute the calibration graph with calibration data. TensorRT uses the
distribution of node data to quantize the weights for the nodes. It is important
to use calibration data that closely reflects the distribution of the problem
dataset in production. We suggest checking for error accumulation during inference
when first using models calibrated with INT8.
3 - After executing the graph on calibration data, apply TensorRT optimizations to
the calibration graph with the calib_graph_to_infer_graph function. This function
also replaces the TensorFlow subgraph with a TensorRT node optimized for INT8.
The output of the function is a frozen TensorFlow graph that can be used for
inference as usual.
'''
with graph.as_default():
trt_graph = trt.calib_graph_to_infer_graph(calib_graph)
newpb_path = self._write_pb(trt_graph,precision_mode)
return trt_graph,newpb_path
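# A sketch of the full INT8 workflow described above, using the methods of this
# class (calibration_batches is a placeholder for representative production data):
#   calib_graph, _ = tft.get_FPxx(graph, graph_def, precision_mode='INT8', dump=False)
#   for batch in calibration_batches:
#       tft.run(graph, calib_graph, sess, 1, batch)
#   int8_graph, newpb_path = tft.get_INT8(graph, calib_graph)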
def convert_NHWC2NCHW(self, graph,sess,tensor_input):
with graph.as_default():
tensor_output = tf.transpose(tensor_input, perm=(0,3,1,2))
tensor_output = sess.run(tensor_output)
return tensor_output
def read_tensor_from_image_file(self, graph, sess, file_name, input_height=224, input_width=224,
input_mean=0, input_std=255, input_name = "file_reader",
output_name = "normalized"):
""" Read a jpg image file and return a tensor """
with graph.as_default():
file_reader = tf.read_file(file_name, input_name)
image_reader = tf.image.decode_jpeg(file_reader, channels=3, name='jpeg_reader')
float_caster = tf.cast(image_reader, tf.float32)
dims_expander = tf.expand_dims(float_caster, 0)
resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
normalized_NHWC = sess.run(normalized)
normalized_NCHW = self.convert_NHWC2NCHW(graph,sess,normalized_NHWC)
return normalized_NHWC,normalized_NCHW
def run(self, graph, graph_def, sess, num_loops, tensor_input):
tf.logging.info('Starting execution')
with graph.as_default():
output = tf.import_graph_def(graph_def=graph_def,
input_map={"input":tensor_input},
return_elements=self.output_nodes)
ans = sess.run(output[0].outputs[0])
return ans
def topX(arr,X):
ind=np.argsort(arr)[:,-X:][:,::-1]
return arr[np.arange(np.shape(arr)[0])[:,np.newaxis],ind],ind
def getLabels(labels,ids):
return [labels[str(x+1)] for x in ids]
if "__main__" == __name__:
parser = argparse.ArgumentParser(prog="optimize a frozen pb file with TF-TensorRT")
parser.add_argument('--FP32',action='store_true')
parser.add_argument('--FP16',action='store_true')
parser.add_argument('--INT8',action='store_true')
parser.add_argument('--native',action='store_true')
parser.add_argument('--num_loops',type=int,default=20)
parser.add_argument('--data_dir',type=str,default='./data')
parser.add_argument('--pb_path',type=str,default='resnetV150_frozen.pb')
parser.add_argument('--mem_percent',type=float,default=0.5)
parser.add_argument('--topN',type=int,default=10)
parser.add_argument('--batch_size',type=int,default=1)
parser.add_argument('--workspace_size',type=int,default=1<<10,help="workspace size in MB")
f,unparsed = parser.parse_known_args()
batch_size = f.batch_size
pb_path = f.pb_path
mem_percent = f.mem_percent
workspace_size = f.workspace_size
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(f'===============start==================')
print("Starting at",datetime.datetime.now())
output_nodes= ["resnet_v1_50/predictions/Reshape_1"]
tft = TF2TensorRT(mem_percent, batch_size, output_nodes)
if f.native:
graph,sess = tft.create_workspace()
graph_def = tft.read_pb(pb_path, graph, sess)
imageName = 'grace_hopper.jpg'
image_input = tft.read_tensor_from_image_file(graph,sess,imageName,
input_height=224,
input_width=224,
input_mean=0,
input_std=1.0)
image_input = image_input[0]
ans = tft.run(graph,graph_def,sess,0,image_input)
tft.close_workspace(graph,graph_def,sess=sess)
ans_topX = topX(ans,1)
print('='*50,ans_topX)
if f.FP32:
graph,sess = tft.create_workspace()
# read the original pb file (note: image_input above is produced by the --native branch, so pass --native as well)
graph_def = tft.read_pb(pb_path, graph, sess)
# convert it via the TensorRT integration in tensorflow.contrib and write the corresponding pb
trt_graph_FP32,newpb_path = tft.get_FPxx(graph,graph_def,
workspace_size=1<<30,
precision_mode='FP32')
tft.close_workspace(graph,graph_def,trt_graph_FP32,sess=sess)
# read the converted pb file
graph,sess = tft.create_workspace()
# read back the newly generated FP32-precision pb file
graph_def_FP32 = tft.read_pb(newpb_path, graph, sess)
# run the converted pb through the normal TensorFlow API
ans = tft.run(graph,graph_def_FP32,sess,0,image_input)
tft.close_workspace(graph,graph_def_FP32,sess=sess)
ans_topX = topX(ans,1)
print('='*50,ans_topX)
if f.FP16:
graph,sess = tft.create_workspace()
graph_def = tft.read_pb(pb_path, graph, sess)
trt_graph_FP16,newpb_path = tft.get_FPxx(graph,graph_def,
workspace_size=1<<30,
precision_mode='FP16')
tft.close_workspace(graph,graph_def,trt_graph_FP16,sess=sess)
# read the converted pb file
graph,sess = tft.create_workspace()
graph_def_FP16 = tft.read_pb(newpb_path, graph, sess)
ans = tft.run(graph,graph_def_FP16,sess,0,image_input)
tft.close_workspace(graph,graph_def_FP16,sess=sess)
ans_topX = topX(ans,1)
print('='*50,ans_topX)
if f.INT8:
graph,sess = tft.create_workspace()
graph_def = tft.read_pb(pb_path, graph, sess)
calibGraph,_ = tft.get_FPxx(graph,graph_def,
workspace_size=1<<30,
precision_mode='INT8',
dump=False)
print("Running Calibration")
# TODO: calibration should run over a representative dataset, not just a single image
tft.run(graph,calibGraph,sess,1,image_input)
print("Creating inference graph")
int8Graph,newpb_path = tft.get_INT8(graph,calibGraph)
tft.close_workspace(graph,graph_def,calibGraph,int8Graph,sess=sess)
print('='*50)
# read the converted pb file
graph,sess = tft.create_workspace()
graph_def_INT8 = tft.read_pb(newpb_path, graph, sess)
ans = tft.run(graph,graph_def_INT8,sess,0,image_input)
tft.close_workspace(graph,graph_def_INT8,sess=sess)
ans_topX = topX(ans,1)
print('='*50,ans_topX)
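To run the script, assuming it is saved as tf_trt_resnet50.py (the filename here is only for illustration) and that resnetV150_frozen.pb and grace_hopper.jpg sit next to it, an invocation along these lines exercises both the native and the converted paths:

python tf_trt_resnet50.py --native --FP32 --FP16 --pb_path resnetV150_frozen.pb --mem_percent 0.5 --batch_size 1

Note that the FP32/FP16/INT8 branches reuse image_input from the --native branch, so --native should be included in the same run.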