engine_config.h

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstdint>
#include <map>
#include <string>
#include <vector>

#include "yaml-cpp/yaml.h"

namespace PaddleDeploy {

struct PaddleEngineConfig {
  // Model file path
  std::string model_filename = "";
  // Model params file path
  std::string params_filename = "";
  // Model encryption key
  std::string key = "";
  // Whether to use the MKL-DNN acceleration library when deploying on CPU
  bool use_mkl = true;
  // Number of threads to use with the MKL-DNN acceleration library
  int mkl_thread_num = 8;
  // Whether to use GPU
  bool use_gpu = false;
  // GPU ID, default is 0
  int gpu_id = 0;
  // Enable IR optimization
  bool use_ir_optim = true;
  // Whether to use TensorRT
  bool use_trt = false;
  // Maximum batch size
  int max_batch_size = 1;
  // TensorRT min_subgraph_size
  int min_subgraph_size = 1;
  /* TensorRT data precision
     0: FP32
     1: FP16
     2: Int8 */
  int precision = 0;
  // When TensorRT is used, whether to serialize the TensorRT engine to disk
  bool use_static = false;
  // When TensorRT is used, whether offline calibration is required
  bool use_calib_mode = false;
  // TensorRT workspace size
  int max_workspace_size = 1 << 10;
  // TensorRT dynamic shape: minimum input shape
  std::map<std::string, std::vector<int>> min_input_shape;
  // TensorRT dynamic shape: maximum input shape
  std::map<std::string, std::vector<int>> max_input_shape;
  // TensorRT dynamic shape: optimal input shape
  std::map<std::string, std::vector<int>> optim_input_shape;
};
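
// Illustrative sketch (not part of the original API): one way to fill a
// PaddleEngineConfig for GPU inference with TensorRT in FP16 mode. The model
// paths, the input tensor name "image", and the shape ranges are assumptions
// chosen only for demonstration.
inline PaddleEngineConfig ExamplePaddleTrtConfig() {
  PaddleEngineConfig config;
  config.model_filename = "model.pdmodel";     // assumed model path
  config.params_filename = "model.pdiparams";  // assumed params path
  config.use_gpu = true;
  config.gpu_id = 0;
  config.use_trt = true;
  config.precision = 1;                 // FP16
  config.max_workspace_size = 1 << 30;  // 1 GiB TensorRT workspace
  // Dynamic-shape ranges for a single input tensor named "image" (assumed).
  config.min_input_shape["image"] = {1, 3, 224, 224};
  config.max_input_shape["image"] = {1, 3, 1024, 1024};
  config.optim_input_shape["image"] = {1, 3, 512, 512};
  return config;
}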

struct TritonEngineConfig {
  TritonEngineConfig() : model_name_(""), model_version_(""),
        request_id_(""), sequence_id_(0), sequence_start_(false),
        sequence_end_(false), priority_(0), server_timeout_(0),
        client_timeout_(0), verbose_(false), url_("") {}
  /// The name of the model to run inference on.
  std::string model_name_;
  /// The version of the model to use while running inference. The default
  /// value is an empty string, which means the server will select the
  /// version of the model based on its internal policy.
  std::string model_version_;
  /// An identifier for the request. If specified, it will be returned
  /// in the response. The default value is an empty string, which means no
  /// request_id will be used.
  std::string request_id_;
  /// The unique identifier of the sequence this request belongs to.
  /// The default value is 0, which means the request does not belong
  /// to a sequence.
  uint64_t sequence_id_;
  /// Indicates whether the request being added marks the start of the
  /// sequence. The default value is false. This argument is ignored if
  /// 'sequence_id' is 0.
  bool sequence_start_;
  /// Indicates whether the request being added marks the end of the
  /// sequence. The default value is false. This argument is ignored if
  /// 'sequence_id' is 0.
  bool sequence_end_;
  /// Indicates the priority of the request. A priority value of zero
  /// indicates that the default priority level should be used
  /// (i.e. the same behavior as not specifying the priority parameter).
  /// Lower values indicate higher priority levels: the highest priority
  /// level is indicated by setting the parameter to 1, the next highest
  /// is 2, and so on. If not provided, the server will handle the request
  /// using the default setting for the model.
  uint64_t priority_;
  /// The timeout value for the request, in microseconds. If the request
  /// cannot be completed within this time, the server can take a
  /// model-specific action such as terminating the request. If not
  /// provided, the server will handle the request using the default
  /// setting for the model.
  uint64_t server_timeout_;
  // The maximum end-to-end time, in microseconds, the request is allowed
  // to take. Note that the HTTP library only offers precision up to
  // milliseconds. The client aborts the request when the specified time
  // elapses, and the request returns an error with the message
  // "Deadline Exceeded". The default value is 0, which means the client
  // will wait for the response from the server. This option is not
  // supported for streaming requests; see the 'stream_timeout' argument
  // of InferenceServerGrpcClient::StartStream() instead.
  uint64_t client_timeout_;
  // Enable client-side logging
  bool verbose_;
  // Address of the inference server
  std::string url_;
};
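
// Illustrative sketch (not part of the original API): one way to fill a
// TritonEngineConfig for a model served over gRPC. The server address
// "localhost:8001", the model name "resnet50", and the timeout value are
// assumptions chosen only for demonstration.
inline TritonEngineConfig ExampleTritonConfig() {
  TritonEngineConfig config;
  config.url_ = "localhost:8001";    // assumed server address
  config.model_name_ = "resnet50";   // assumed model name
  config.model_version_ = "1";       // pin a specific model version
  config.client_timeout_ = 5000000;  // 5 s, expressed in microseconds
  config.verbose_ = true;            // enable client-side logging
  return config;
}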

struct TensorRTEngineConfig {
  // ONNX model path
  std::string model_file_ = "";
  // Paddle model config file
  std::string cfg_file_ = "";
  // GPU workspace size
  int max_workspace_size_ = 1 << 28;
  // Maximum batch size
  int max_batch_size_ = 1;
  // GPU ID
  int gpu_id_ = 0;
  // Whether to serialize the built engine to disk
  bool save_engine_ = false;
  // Path of the serialized engine cache file
  std::string trt_cache_file_ = "";
  // Input and output info
  YAML::Node yaml_config_;
};
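
// Illustrative sketch (not part of the original API): one way to fill a
// TensorRTEngineConfig for an ONNX model whose built engine is cached on
// disk. The file paths are assumptions chosen only for demonstration.
inline TensorRTEngineConfig ExampleTensorRTConfig() {
  TensorRTEngineConfig config;
  config.model_file_ = "model.onnx";     // assumed ONNX model path
  config.cfg_file_ = "deploy.yaml";      // assumed model config file
  config.max_workspace_size_ = 1 << 30;  // 1 GiB workspace
  config.max_batch_size_ = 4;
  config.gpu_id_ = 0;
  config.save_engine_ = true;            // serialize the built engine
  config.trt_cache_file_ = "model.trt";  // assumed engine cache path
  return config;
}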

struct InferenceConfig {
  std::string engine_type;
  // Only the pointer that matches engine_type is valid.
  union {
    PaddleEngineConfig* paddle_config;
    TritonEngineConfig* triton_config;
    TensorRTEngineConfig* tensorrt_config;
  };

  InferenceConfig() {
    paddle_config = nullptr;
  }

  explicit InferenceConfig(std::string engine_type) {
    this->engine_type = engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
    } else {
      // Unknown engine type: leave the union empty.
      paddle_config = nullptr;
    }
  }

  InferenceConfig(const InferenceConfig& config) {
    engine_type = config.engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
      *paddle_config = *(config.paddle_config);
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
      *triton_config = *(config.triton_config);
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
      *tensorrt_config = *(config.tensorrt_config);
    } else {
      paddle_config = nullptr;
    }
  }

  ~InferenceConfig() {
    if ("paddle" == engine_type) {
      delete paddle_config;
      paddle_config = nullptr;
    } else if ("triton" == engine_type) {
      delete triton_config;
      triton_config = nullptr;
    } else if ("tensorrt" == engine_type) {
      delete tensorrt_config;
      tensorrt_config = nullptr;
    }
  }
};
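
// Illustrative sketch (not part of the original API): constructing an
// InferenceConfig with an engine type selects which member of the union is
// allocated, so only the matching pointer should be dereferenced. The
// settings below are assumptions chosen only for demonstration.
inline InferenceConfig ExampleInferenceConfig() {
  InferenceConfig config("paddle");  // allocates paddle_config
  config.paddle_config->use_gpu = true;
  config.paddle_config->gpu_id = 0;
  return config;  // returned via the user-defined copy constructor (or elided)
}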

}  // namespace PaddleDeploy