Model Acceleration [TensorFlow & TensorRT]


Starting with TensorFlow 1.8, tensorflow.contrib ships a TensorRT component (tensorflow.contrib.tensorrt). With it you can load a frozen pb file and call TensorRT to compress and optimize the subgraphs it supports; see the NVIDIA post tensorrt-integration-speeds-tensorflow-inference. A minimal sketch of the core call comes first, followed by my modified version of the full sample script. On a P40 card the FP32 and FP16 results are identical, because the P40 has no native FP16 support. The INT8 path, however, still fails; the cause is under investigation.
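
The core call is trt.create_inference_graph: it takes the frozen GraphDef plus the output node names and returns a new GraphDef in which the subgraphs TensorRT can handle have been replaced by TensorRT engine nodes. Below is a minimal sketch, assuming the same frozen ResNet-50 pb and output node that the full script uses later (file names are just those defaults):

import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
from tensorflow.python.platform import gfile

# load the frozen graph
graph_def = tf.GraphDef()
with gfile.FastGFile('resnetV150_frozen.pb', 'rb') as fr:
    graph_def.ParseFromString(fr.read())

# let TensorRT replace the supported subgraphs with TensorRT engine nodes
trt_graph = trt.create_inference_graph(
    input_graph_def=graph_def,
    outputs=["resnet_v1_50/predictions/Reshape_1"],
    max_batch_size=1,
    max_workspace_size_bytes=1 << 30,
    precision_mode='FP32')  # 'FP32', 'FP16' or 'INT8'

# write the converted graph back out as a regular pb file
with gfile.FastGFile('resnetV150_frozenFP32.pb', 'wb') as fw:
    fw.write(trt_graph.SerializeToString())

The full modified script follows.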

# -*- coding: utf-8 -*-
r""" TF-TensorRT integration sample script 

1 - Specify the fraction of GPU memory allowed for TensorFlow. TensorRT can use the remaining memory.
2 - Let TensorRT analyze the TensorFlow graph, apply optimizations, and replace subgraphs with TensorRT nodes.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import time
import json
import os.path as osp
import argparse, itertools, datetime

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.client import timeline
import tensorflow.contrib.tensorrt as trt

tf.logging.set_verbosity(tf.logging.INFO)


class TF2TensorRT(object):
    '''Convert a model through the TensorRT component bundled in
        tensorflow.contrib. TensorRT automatically rewrites the subgraphs it
        supports, and the result can be written out as a new pb file. For
        deployment the converted pb is still loaded through the normal
        TensorFlow API: subgraphs that TensorRT could convert run as TensorRT
        engines, everything else keeps running in TensorFlow.'''
    def __init__(self, percent, batch_size, output_nodes):
        '''Use the new per_process_gpu_memory_fraction parameter of the GPUOptions
           function to specify the GPU memory fraction TensorRT can consume. This 
           parameter should be set the first time the TensorFlow-TensorRT process
           starts. As an example, 0.67 would allocate 67% of GPU memory for TensorFlow, 
           making the remaining 33% available for TensorRT engines. '''
        self.batch_size = batch_size
        self.output_nodes = output_nodes

        self.gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=percent)
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)

    def read_pb(self, pb_path, graph, sess):
        '''read the model from pb file '''
        self.pb_path = pb_path
        with graph.as_default():
            with gfile.FastGFile(pb_path, 'rb') as fr:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(fr.read())
        return graph_def


    def _write_pb(self, trt_graph, precision_mode):
        '''write the converted model into a new pb file '''
        path_stem, ext = osp.splitext(self.pb_path)
        newpb_filename = f'{path_stem}{precision_mode}{ext}'
        with gfile.FastGFile(newpb_filename, 'wb') as fw:
            fw.write(trt_graph.SerializeToString())
        return newpb_filename

    def create_workspace(self):
        graph = tf.Graph()
        with graph.as_default():
            sess = tf.Session(graph=graph, config=self.config)
        return graph, sess

    def close_workspace(self, *args, sess=None):
        sess.close()

    def get_FPxx(self, 
                 graph,graph_def,
                 workspace_size=1<<30, 
                 precision_mode='FP32', 
                 dump=True):
        '''You apply TensorRT optimizations to the frozen graph with the new 
           create_inference_graph function. TensorRT then takes a frozen TensorFlow 
           graph as input and returns an optimized graph with TensorRT nodes 

           You should use the per_process_gpu_memory_fraction and max_workspace_size_bytes 
           parameters together for best overall application performance. For example, 
           set the per_process_gpu_memory_fraction parameter to ( 12 – 4 ) / 12 = 0.67 
           and the max_workspace_size_bytes parameter to 4000000000 for a 12GB GPU 
           in order to allocate ~4GB for the TensorRT engines.

           TensorRT automatically uses Tensor Cores in Volta GPUs for inference when using 
           half-precision arithmetic. The peak performance of Tensor Cores on the NVIDIA 
           Tesla V100 is about an order of magnitude (10x) faster than double precision (FP64) 
           and about 4 times faster than single precision (FP32). Just use FP16 as value for 
           the precision_mode parameter in the create_inference_graph function to enable 
           half precision
        ---
        frozen_graph_def: frozen TensorFlow graph
        output_node_name: list of strings with names of output nodes
                          e.g. ["resnet_v1_50/predictions/Reshape_1"]
        max_batch_size:   integer, size of input batch e.g. 16
        max_workspace_size_bytes:   integer, maximum GPU memory size available for TensorRT
        precision_mode:   string, allowed values FP32, FP16 or INT8

        '''
        
        with graph.as_default():
            trt_graph = trt.create_inference_graph(graph_def, self.output_nodes,
                                                   max_batch_size=self.batch_size,
                                                   max_workspace_size_bytes=workspace_size,
                                                   precision_mode=precision_mode )
            if dump:
                newpb_path = self._write_pb(trt_graph, precision_mode)
            else:
                newpb_path=''

        return trt_graph,newpb_path


    def get_INT8(self, 
                 graph,
                 calib_graph,
                 workspace_size=1<<30, 
                 precision_mode='INT8'): 
        '''TensorRT provides capabilities to take models trained in single (FP32) and 
           half (FP16) precision and convert them for deployment with INT8 quantizations 
           while minimizing accuracy loss.

           To convert models for deployment with INT8, you need to calibrate the trained 
           FP32 model before applying TensorRT’s optimizations described in the earlier 
           sections. The remaining workflow remains unchanged 
          
           1 - First use the create_inference_graph function with the precision_mode parameter 
               set to INT8 to calibrate the model. The output of this function is a frozen 
               TensorFlow graph ready for calibration.
           2 - Next, execute the calibration graph with calibration data. TensorRT uses the 
               distribution of node data to quantize the weights for the nodes. It is important
               to use calibration data that closely reflects the distribution of the problem 
               dataset in production. We suggest checking for error accumulation during inference
               when first using models calibrated with INT8.
           3 - After executing the graph on calibration data, apply TensorRT optimizations to 
               the calibration graph with the calib_graph_to_infer_graph function. This function 
               also replaces the TensorFlow subgraph with a TensorRT node optimized for INT8. 
               The output of the function is a frozen TensorFlow graph that can be used for 
               inference as usual.

        '''
        with graph.as_default():
            trt_graph = trt.calib_graph_to_infer_graph(calib_graph)
            newpb_path = self._write_pb(trt_graph, precision_mode)
        return trt_graph, newpb_path
          

    def convert_NHWC2NCHW(self, graph,sess,tensor_input):
        with graph.as_default():
            tensor_output = tf.transpose(tensor_input, perm=(0,3,1,2))
            tensor_output = sess.run(tensor_output)
        return tensor_output

    def read_tensor_from_image_file(self, graph, sess, file_name, input_height=224, input_width=224,
                                input_mean=0, input_std=255, input_name = "file_reader",
                                output_name = "normalized"):
        """ Read a jpg image file and return a tensor """
        with graph.as_default():
            file_reader = tf.read_file(file_name, input_name)
            # the test image is a jpg, so decode it with decode_jpeg
            image_reader = tf.image.decode_jpeg(file_reader, channels=3, name='jpg_reader')
            float_caster = tf.cast(image_reader, tf.float32)
            dims_expander = tf.expand_dims(float_caster, 0)
            resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
            normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
            normalized_NHWC = sess.run(normalized)
            normalized_NCHW = self.convert_NHWC2NCHW(graph,sess,normalized_NHWC)
      
        return normalized_NHWC,normalized_NCHW

    def run(self, graph, graph_def, sess, num_loops, tensor_input):
        tf.logging.info('Starting execution')
        
        with graph.as_default():
            output = tf.import_graph_def(graph_def=graph_def,
                                     input_map={"input":tensor_input},
                                     return_elements=self.output_nodes)     
            
            ans = sess.run(output[0].outputs[0])
        return ans
            




def topX(arr,X):
  ind=np.argsort(arr)[:,-X:][:,::-1]
  return arr[np.arange(np.shape(arr)[0])[:,np.newaxis],ind],ind

def getLabels(labels,ids):
  return [labels[str(x+1)] for x in ids]

if "__main__" == __name__:

  parser = argparse.ArgumentParser(prog="convert pb model file into uff!")
  parser.add_argument('--FP32',action='store_true')
  parser.add_argument('--FP16',action='store_true')
  parser.add_argument('--INT8',action='store_true')
  parser.add_argument('--native',action='store_true')
  parser.add_argument('--num_loops',type=int,default=20)
  parser.add_argument('--data_dir',type=str,default='./data')
  parser.add_argument('--pb_path',type=str,default='resnetV150_frozen.pb')
  parser.add_argument('--mem_percent',type=float,default=0.5)
  parser.add_argument('--topN',type=int,default=10)
  parser.add_argument('--batch_size',type=int,default=1)
  parser.add_argument('--workspace_size',type=int,default=1<<10,help="workspace size in MB")
  
  f,unparsed = parser.parse_known_args()
  batch_size = f.batch_size
  pb_path = f.pb_path
  mem_percent = f.mem_percent
  workspace_size = f.workspace_size
  
  
  os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
  print(f'===============start==================')
  print("Starting at",datetime.datetime.now())

  output_nodes= ["resnet_v1_50/predictions/Reshape_1"]
  tft = TF2TensorRT(mem_percent, batch_size, output_nodes)
  

  # Preprocess the test image once. The result is a plain NumPy array, so it
  # can be fed to the native graph as well as to every converted graph below.
  graph,sess = tft.create_workspace()
  imageName = 'grace_hopper.jpg'
  image_input = tft.read_tensor_from_image_file(graph, sess, imageName,
                                                input_height=224,
                                                input_width=224,
                                                input_mean=0,
                                                input_std=1.0)[0]
  tft.close_workspace(sess=sess)

  if f.native:
      graph,sess = tft.create_workspace()
      graph_def = tft.read_pb(pb_path, graph, sess)
      ans = tft.run(graph,graph_def,sess,0,image_input)
      tft.close_workspace(graph,graph_def,sess=sess)
      ans_topX = topX(ans,1)
      print('='*50,ans_topX)

  if f.FP32:
      graph,sess = tft.create_workspace()
      # read the original pb file
      graph_def = tft.read_pb(pb_path, graph, sess)
      # convert with tensorflow.contrib.tensorrt and write the converted pb file
      trt_graph_FP32,newpb_path = tft.get_FPxx(graph,graph_def,
                                    workspace_size=1<<30,
                                    precision_mode='FP32')
      tft.close_workspace(graph,graph_def,trt_graph_FP32,sess=sess)
      # read the converted pb file
      
      graph,sess = tft.create_workspace()
      # read back the pb file generated for this precision mode
      graph_def_FP32 = tft.read_pb(newpb_path, graph, sess)
      # run the converted pb through the normal TensorFlow API
      ans = tft.run(graph,graph_def_FP32,sess,0,image_input)
      tft.close_workspace(graph,graph_def_FP32,sess=sess)
      ans_topX = topX(ans,1)
      print('='*50,ans_topX)

  if f.FP16:
      graph,sess = tft.create_workspace()
      graph_def = tft.read_pb(pb_path, graph, sess)
      trt_graph_FP16,newpb_path = tft.get_FPxx(graph,graph_def,
                                    workspace_size=1<<30,
                                    precision_mode='FP16')
      tft.close_workspace(graph,graph_def,trt_graph_FP16,sess=sess)
      # read the converted pb file

      graph,sess = tft.create_workspace()
      graph_def_FP16 = tft.read_pb(newpb_path, graph, sess)
      ans = tft.run(graph,graph_def_FP16,sess,0,image_input)
      tft.close_workspace(graph,graph_def_FP16,sess=sess)
      ans_topX = topX(ans,1)
      print('='*50,ans_topX)

  if f.INT8:
      graph,sess = tft.create_workspace()
      graph_def = tft.read_pb(pb_path, graph, sess)
      calibGraph,_ = tft.get_FPxx(graph,graph_def,
                                workspace_size=1<<30,
                                precision_mode='INT8',
                                dump=False)
      print("Running Calibration")
      # TODO
      timings,comp,_,mdstats=tft.run(graph,calibGraph,sess,1,image_input)
      print("Creating inference graph")
      int8Graph,newpb_path = self.get_INT8(calibGraph)
      tft.close_workspace(graph,graph_def,calibGraph,int8Graph,sess=sess)
      print('='*50)

      # read the converted pb file
      graph,sess = tft.create_workspace()
      graph_def_INT8 = tft.read_pb(newpb_path, graph, sess)
      ans = tft.run(graph,graph_def_INT8,sess,0,image_input)
      tft.close_workspace(graph,graph_def_INT8,sess=sess)
      ans_topX = topX(ans,1)
      print('='*50,ans_topX)
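
Assuming the script above is saved as, say, tftrt_resnet.py (the file name is arbitrary), a typical run that benchmarks the native graph together with the FP32 and FP16 conversions looks like this:

python tftrt_resnet.py --native --FP32 --FP16 \
       --pb_path resnetV150_frozen.pb \
       --batch_size 1 --mem_percent 0.5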