/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file runtime/contrib/tensorrt/tensorrt_builder.h
 * \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine
 * which can be used for inference.
 */

#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_

#include <tvm/runtime/ndarray.h>

#include <string>
#include <unordered_map>
#include <vector>

#include "../json/json_node.h"
#include "NvInfer.h"
#include "tensorrt_logger.h"
#include "tensorrt_ops.h"

namespace tvm {
namespace runtime {
namespace contrib {

using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;

/*!
 * \brief The product of TensorRTBuilder which provides everything needed to
 * perform inference.
 */
struct TensorRTEngineAndContext {
  nvinfer1::ICudaEngine* engine;
  nvinfer1::IExecutionContext* context;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};
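// Typical build flow (a minimal sketch for illustration only, not part of this
// header's API): it mirrors how TVM's TensorRT JSONRuntime is expected to drive
// TensorRTBuilder. Here `logger`, `nodes_`, `outputs_`, `data_entry_`, and
// `EntryID()` stand in for JSONRuntimeBase state and helpers; the node op types
// "input", "const", and "kernel" are those used by the JSON runtime.
//
//   TensorRTBuilder builder(&logger, data_entry_, /*max_workspace_size=*/1 << 30,
//                           /*use_implicit_batch=*/true, /*use_fp16=*/false,
//                           /*batch_size=*/1);
//   for (int nid = 0; nid < static_cast<int>(nodes_.size()); ++nid) {
//     const auto& node = nodes_[nid];
//     if (node.GetOpType() == "input") {
//       builder.AddInput(nid, EntryID(nid, 0), node);             // graph inputs
//     } else if (node.GetOpType() == "const") {
//       builder.AddConstant(nid, data_entry_[EntryID(nid, 0)]);   // weights
//     } else if (node.GetOpType() == "kernel") {
//       builder.AddLayer(nid, node);                              // ops
//     }
//   }
//   for (const auto& entry : outputs_) {
//     builder.AddOutput(entry, EntryID(entry));
//   }
//   // Slow step: TensorRT profiles kernels and fusions to optimize the engine.
//   TensorRTEngineAndContext trt = builder.BuildEngine();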
/*!
 * \brief Converts a JSONRuntime graph into a TensorRT engine and execution context. Inputs,
 * constants, layers, and outputs can be added to construct the TensorRT network definition.
 * BuildEngine() will then use the network definition to build the TensorRT engine and context
 * which can be used to run inference. This phase can take a long time because TensorRT will
 * query the performance of all available kernels and fusions to optimize the engine.
 */
class TensorRTBuilder {
 public:
  /*!
   * \brief Create TensorRT builder.
   * \param logger TensorRT logger to use for errors and warnings.
   * \param data_entry All input and output tensors from TVM.
   * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
   * \param use_implicit_batch Whether to use implicit batch mode (the TensorRT default).
   * \param use_fp16 Whether to automatically convert the model to 16-bit floating point precision.
   * \param batch_size Batch size to optimize for, used when use_implicit_batch is true.
   * \param calibrator Calibrator used to supply batch data in int8 mode; nullptr when using fp16
   * or fp32 precision.
   */
  TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
                  size_t max_workspace_size, bool use_implicit_batch, bool use_fp16,
                  int batch_size, nvinfer1::IInt8Calibrator* calibrator = nullptr);

  /*!
   * \brief Add TensorRT input(s) for input node in network definition.
   * \param nid The input node id.
   * \param entry_id The index into data_entry_ for the first entry in the node.
   * \param node The input node.
   */
  void AddInput(int nid, uint32_t entry_id, const JSONGraphNode& node);

  /*!
   * \brief Add TensorRT weight for input constant in network definition.
   * \param nid The input node id.
   * \param data The constant data tensor on CPU.
   */
  void AddConstant(int nid, const DLTensor* data);

  /*!
   * \brief Add TensorRT layer for op node in network definition.
   * \param nid The input node id.
   * \param node The op node.
   */
  void AddLayer(int nid, const JSONGraphNode& node);

  /*!
   * \brief Mark TensorRT output in network definition.
   * \param entry The output node entry.
   * \param entry_id The output node entry id.
   */
  void AddOutput(const JSONGraphNodeEntry& entry, uint32_t entry_id);

  /*!
   * \brief Takes the network definition and "compiles" a TensorRT engine which can be used for
   * inference. This step is time consuming.
   * \return TRT engine, context, and input/output information.
   */
  TensorRTEngineAndContext BuildEngine();

 private:
  /*! \brief Convert a DLTensor to a TensorRT weight. */
  nvinfer1::Weights GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device);

  /*! \brief Convert an input to a tensor if it is a weight. */
  nvinfer1::ITensor* GetInputAsTensor(const TensorRTOpInput& input);

  /*! \brief Clean up resources used to create engine. */
  void CleanUp();

  /*! \brief Maps a node to its outputs. */
  std::unordered_map<int, std::vector<nvinfer1::ITensor*>> node_output_map_;

  /*! \brief TensorRT builder. */
  nvinfer1::IBuilder* builder_;

#if TRT_VERSION_GE(6, 0, 1)
  /*! \brief TensorRT builder config. */
  nvinfer1::IBuilderConfig* config_;
#endif

  /*! \brief TensorRT network definition. */
  nvinfer1::INetworkDefinition* network_;

  /*! \brief List of all weights held in memory. */
  std::vector<nvinfer1::Weights> trt_weights_;

  /*! \brief Input and output tensors from TVM. */
  const std::vector<const DLTensor*>& data_entry_;

  /*! \brief Map TensorRT binding name to index in data_entry_. */
  std::unordered_map<std::string, uint32_t> entry_id_map_;

  /*! \brief Max workspace size in bytes for TRT. */
  size_t max_workspace_size_;

  /*! \brief Whether to use implicit batch mode. */
  bool use_implicit_batch_;

  /*! \brief Whether to automatically convert model to 16-bit floating point precision. */
  bool use_fp16_;

  /*! \brief Whether to automatically convert model to int8 precision. */
  bool use_int8_;

  /*! \brief Batch size to optimize for. */
  int batch_size_;

  /*! \brief Input names. */
  std::vector<std::string> network_input_names_;

  /*! \brief Output names. */
  std::vector<std::string> network_output_names_;

  /*! \brief Calibrator used to supply batch data in int8 mode; nullptr when using fp16 or fp32
   * precision. */
  nvinfer1::IInt8Calibrator* calibrator_;
};

}  // namespace contrib
}  // namespace runtime
}  // namespace tvm

#endif  // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
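// Usage note: running inference with the built engine (a sketch assuming
// implicit batch mode; how the device pointers in `bindings` are obtained is
// runtime-specific and elided here). TensorRT looks up each binding slot by
// name, which is why TensorRTEngineAndContext records input/output names.
//
//   TensorRTEngineAndContext trt = builder.BuildEngine();
//   std::vector<void*> bindings(trt.engine->getNbBindings(), nullptr);
//   for (const std::string& name : trt.inputs) {
//     int idx = trt.engine->getBindingIndex(name.c_str());
//     bindings[idx] = /* device pointer for the input tensor named `name` */;
//   }
//   for (const std::string& name : trt.outputs) {
//     int idx = trt.engine->getBindingIndex(name.c_str());
//     bindings[idx] = /* device pointer for the output tensor named `name` */;
//   }
//   trt.context->execute(/*batchSize=*/1, bindings.data());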