trt_backend.h
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <cuda_runtime_api.h>

#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "ultra_infer/runtime/backends/backend.h"
#include "ultra_infer/runtime/backends/tensorrt/option.h"
#include "ultra_infer/runtime/backends/tensorrt/utils.h"
#include "ultra_infer/utils/unique_ptr.h"
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 public:
  explicit Int8EntropyCalibrator2(const std::string &calibration_cache)
      : calibration_cache_(calibration_cache) {}

  int getBatchSize() const noexcept override { return 0; }

  bool getBatch(void *bindings[], const char *names[],
                int nbBindings) noexcept override {
    return false;
  }

  const void *readCalibrationCache(size_t &length) noexcept override {
    length = calibration_cache_.size();
    return length ? calibration_cache_.data() : nullptr;
  }

  void writeCalibrationCache(const void *cache,
                             size_t length) noexcept override {
    ultra_infer::FDERROR << "Not implemented." << std::endl;
  }

 private:
  const std::string calibration_cache_;
};
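
// A minimal sketch (not from this file) of how such a calibrator is
// typically wired into an INT8 engine build. `config` is assumed to be a
// valid nvinfer1::IBuilderConfig created elsewhere, and ReadFile() is a
// hypothetical helper that loads the cached calibration table from disk:
//
//   std::string cache = ReadFile("calibration.cache");
//   Int8EntropyCalibrator2 calibrator(cache);
//   config->setFlag(nvinfer1::BuilderFlag::kINT8);
//   config->setInt8Calibrator(&calibrator);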

namespace ultra_infer {

struct TrtValueInfo {
  std::string name;
  std::vector<int> shape;
  nvinfer1::DataType dtype;   // dtype of TRT model
  FDDataType original_dtype;  // dtype of original ONNX/Paddle model
};

std::vector<int> toVec(const nvinfer1::Dims &dim);
size_t TrtDataTypeSize(const nvinfer1::DataType &dtype);
FDDataType GetFDDataType(const nvinfer1::DataType &dtype);
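
// For reference (an illustrative sketch, not normative): toVec flattens an
// nvinfer1::Dims such as {nbDims = 3, d = {1, 3, 224}} into
// std::vector<int>{1, 3, 224}; TrtDataTypeSize(nvinfer1::DataType::kFLOAT)
// would return 4; and GetFDDataType maps a TRT dtype to the framework's
// FDDataType (e.g. kFLOAT to FP32, assuming those enum names).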

class TrtBackend : public BaseBackend {
 public:
  TrtBackend() : engine_(nullptr), context_(nullptr) {}

  bool Init(const RuntimeOption &runtime_option);
  bool Infer(std::vector<FDTensor> &inputs, std::vector<FDTensor> *outputs,
             bool copy_to_fd = true) override;

  int NumInputs() const { return inputs_desc_.size(); }
  int NumOutputs() const { return outputs_desc_.size(); }
  TensorInfo GetInputInfo(int index);
  TensorInfo GetOutputInfo(int index);
  std::vector<TensorInfo> GetInputInfos() override;
  std::vector<TensorInfo> GetOutputInfos() override;

  std::unique_ptr<BaseBackend> Clone(RuntimeOption &runtime_option,
                                     void *stream = nullptr,
                                     int device_id = -1) override;

  ~TrtBackend() {
    if (parser_) {
      parser_.reset();
    }
  }

 private:
  void BuildOption(const TrtBackendOption &option);

  bool InitFromPaddle(const std::string &model_buffer,
                      const std::string &params_buffer,
                      const TrtBackendOption &option = TrtBackendOption(),
                      bool verbose = false);
  bool InitFromOnnx(const std::string &model_buffer,
                    const TrtBackendOption &option = TrtBackendOption());

  TrtBackendOption option_;
  std::shared_ptr<nvinfer1::ICudaEngine> engine_;
  std::shared_ptr<nvinfer1::IExecutionContext> context_;
  FDUniquePtr<nvonnxparser::IParser> parser_;
  FDUniquePtr<nvinfer1::IBuilder> builder_;
  FDUniquePtr<nvinfer1::INetworkDefinition> network_;
  cudaStream_t stream_{};
  std::vector<void *> bindings_;
  std::vector<TrtValueInfo> inputs_desc_;
  std::vector<TrtValueInfo> outputs_desc_;
  std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
  std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
  std::map<std::string, int> io_name_index_;
  std::string calibration_str_;
  bool save_external_ = false;
  std::string model_file_name_ = "";

  // When the model has more than one output, TensorRT may not preserve the
  // output order of the original ONNX model, so this map records the
  // original output order and is used to restore it.
  std::map<std::string, int> outputs_order_;

  // Temporarily stores the ONNX model content; it is released once the TRT
  // engine has been built from it.
  std::string onnx_model_buffer_;

  // Stores shape information of the loaded model. For dynamic shapes this
  // records the shape range, which is also updated during inference.
  std::map<std::string, ShapeRangeInfo> shape_range_info_;
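
  // Illustrative example (assumed semantics; see utils.h for ShapeRangeInfo):
  // for a dynamic input "x" built with min shape [1,3,224,224] and max shape
  // [8,3,224,224], shape_range_info_["x"] tracks that range, and
  // ShapeRangeInfoUpdated() (declared below) reports whether an incoming
  // shape falls outside it, which requires rebuilding the engine.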

  // If the final output tensor's dtype differs from the model output
  // tensor's dtype, the data must be cast to the final output's dtype,
  // e.g. when the TRT model outputs int32 but the final tensor is int64.
  // This map stores the casted tensors.
  std::map<std::string, FDTensor> casted_output_tensors_;
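
  // Sketch of the cast path (illustrative, not verbatim from the .cc file;
  // function::Cast is an assumed helper and "out0" a hypothetical name):
  //   FDTensor &cached = casted_output_tensors_["out0"];   // int32 from TRT
  //   function::Cast(cached, &final_output, FDDataType::INT64);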

  void GetInputOutputInfo();
  bool CreateTrtEngineFromOnnx(const std::string &onnx_model_buffer);
  bool BuildTrtEngine();
  bool LoadTrtCache(const std::string &trt_engine_file);
  int ShapeRangeInfoUpdated(const std::vector<FDTensor> &inputs);
  void SetInputs(const std::vector<FDTensor> &inputs);
  void AllocateOutputsBuffer(std::vector<FDTensor> *outputs,
                             bool copy_to_fd = true);
};

}  // namespace ultra_infer
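
// A minimal usage sketch (assumptions flagged inline): TrtBackend is normally
// driven through the Runtime/RuntimeOption layer rather than used directly.
//
//   ultra_infer::TrtBackend backend;
//   ultra_infer::RuntimeOption option;
//   option.UseTrtBackend();  // assumed RuntimeOption helper
//   option.SetModelPath("model.onnx", "", ultra_infer::ModelFormat::ONNX);
//   if (backend.Init(option)) {
//     std::vector<ultra_infer::FDTensor> inputs, outputs;
//     // ... fill `inputs` with named FDTensor data ...
//     backend.Infer(inputs, &outputs);  // copy_to_fd defaults to true
//   }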