engine_config.h

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <map>
#include <string>
#include <vector>

#include "yaml-cpp/yaml.h"

namespace PaddleDeploy {
struct PaddleEngineConfig {
  // Model file path
  std::string model_filename = "";
  // Model params file path
  std::string params_filename = "";
  // Whether to use the MKL-DNN acceleration library when deploying on CPU
  bool use_mkl = true;
  // Number of threads to use with the MKL-DNN accelerator
  int mkl_thread_num = 8;
  // Whether to use GPU
  bool use_gpu = false;
  // GPU ID to run on, default is 0
  int gpu_id = 0;
  // Enable IR optimization
  bool use_ir_optim = true;
  // Whether to use TensorRT
  bool use_trt = false;
  // Maximum batch size
  int max_batch_size = 1;
  // TensorRT min_subgraph_size
  int min_subgraph_size = 1;
  /* TensorRT data precision
     0: FP32
     1: FP16
     2: Int8
  */
  int precision = 0;
  // When TensorRT is used, whether to serialize the TensorRT engine to disk
  bool use_static = false;
  // Whether offline calibration is required when TensorRT is used
  bool use_calib_mode = false;
  // TensorRT workspace size
  int max_workspace_size = 1 << 10;
  // TensorRT dynamic shape: minimum input shape
  std::map<std::string, std::vector<int>> min_input_shape;
  // TensorRT dynamic shape: maximum input shape
  std::map<std::string, std::vector<int>> max_input_shape;
  // TensorRT dynamic shape: optimal input shape
  std::map<std::string, std::vector<int>> optim_input_shape;
};
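// Usage sketch (illustrative, not part of the original header): filling in a
// PaddleEngineConfig for GPU + TensorRT inference with dynamic input shapes.
// The file names, the input name "image", and the shape values below are
// assumptions chosen for the example.
inline PaddleEngineConfig MakeExamplePaddleTrtConfig() {
  PaddleEngineConfig config;
  config.model_filename = "model.pdmodel";     // assumed model path
  config.params_filename = "model.pdiparams";  // assumed params path
  config.use_gpu = true;
  config.use_trt = true;
  config.precision = 1;  // FP16 (see the precision comment above)
  // Dynamic shape range for a hypothetical input tensor named "image"
  config.min_input_shape["image"] = {1, 3, 224, 224};
  config.max_input_shape["image"] = {4, 3, 640, 640};
  config.optim_input_shape["image"] = {1, 3, 320, 320};
  return config;
}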
struct TritonEngineConfig {
  TritonEngineConfig() : model_name_(""), model_version_(""),
    request_id_(""), sequence_id_(0), sequence_start_(false),
    sequence_end_(false), priority_(0), server_timeout_(0),
    client_timeout_(0), verbose_(false), url_("") {}
  /// The name of the model to run inference on.
  std::string model_name_;
  /// The version of the model to use while running inference. The default
  /// value is an empty string, which means the server will select the
  /// version of the model based on its internal policy.
  std::string model_version_;
  /// An identifier for the request. If specified, it will be returned
  /// in the response. The default value is an empty string, which means no
  /// request_id will be used.
  std::string request_id_;
  /// The unique identifier of the sequence the request belongs to.
  /// The default value is 0, which means the request does not belong to a
  /// sequence.
  uint64_t sequence_id_;
  /// Indicates whether the request marks the start of the sequence.
  /// The default value is false. This argument is ignored if 'sequence_id'
  /// is 0.
  bool sequence_start_;
  /// Indicates whether the request marks the end of the sequence.
  /// The default value is false. This argument is ignored if 'sequence_id'
  /// is 0.
  bool sequence_end_;
  /// Indicates the priority of the request. A priority value of zero
  /// indicates that the default priority level should be used
  /// (i.e. the same behavior as not specifying the priority parameter).
  /// Lower values indicate higher priority levels: the highest priority
  /// level is indicated by setting the parameter to 1, the next highest
  /// is 2, and so on. If not provided, the server will handle the request
  /// using the default setting for the model.
  uint64_t priority_;
  /// The timeout value for the request, in microseconds. If the request
  /// cannot be completed within this time, the server can take a
  /// model-specific action such as terminating the request. If not
  /// provided, the server will handle the request using the default
  /// setting for the model.
  uint64_t server_timeout_;
  // The maximum end-to-end time, in microseconds, the request is allowed
  // to take. Note that the HTTP library only offers precision up to
  // milliseconds. The client will abort the request when the specified
  // time elapses, and the request will return an error with the message
  // "Deadline Exceeded". The default value is 0, which means the client
  // will wait for the response from the server. This option is not
  // supported for streaming requests; instead, see the 'stream_timeout'
  // argument in InferenceServerGrpcClient::StartStream().
  uint64_t client_timeout_;
  // Whether to enable verbose client logging
  bool verbose_;
  // The server address to send requests to
  std::string url_;
};
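// Usage sketch (illustrative, not part of the original header): pointing a
// TritonEngineConfig at a Triton server. The model name, URL, and timeout
// values are assumptions chosen for the example.
inline TritonEngineConfig MakeExampleTritonConfig() {
  TritonEngineConfig config;
  config.model_name_ = "resnet50";           // assumed model name
  config.model_version_ = "1";               // pin a specific version
  config.url_ = "localhost:8001";            // assumed server endpoint
  config.client_timeout_ = 5 * 1000 * 1000;  // 5 seconds, in microseconds
  config.verbose_ = true;                    // enable client-side logging
  return config;
}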
struct TensorRTEngineConfig {
  // ONNX model path
  std::string model_file_ = "";
  // Paddle model config file
  std::string cfg_file_ = "";
  // GPU workspace size
  int max_workspace_size_ = 1 << 28;
  int max_batch_size_ = 1;
  int gpu_id_ = 0;
  bool save_engine_ = false;
  std::string trt_cache_file_ = "";
  // Input and output info
  YAML::Node yaml_config_;
};
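// Usage sketch (illustrative, not part of the original header): filling in a
// TensorRTEngineConfig from an ONNX model plus a YAML description of the
// inputs and outputs. The file paths are assumptions chosen for the example;
// YAML::LoadFile is the standard yaml-cpp loader.
inline TensorRTEngineConfig MakeExampleTensorRTConfig() {
  TensorRTEngineConfig config;
  config.model_file_ = "model.onnx";    // assumed ONNX model path
  config.cfg_file_ = "infer_cfg.yml";   // assumed Paddle config path
  config.yaml_config_ = YAML::LoadFile(config.cfg_file_);
  config.save_engine_ = true;           // cache the built engine on disk
  config.trt_cache_file_ = "model.trt"; // assumed cache path
  return config;
}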
struct InferenceConfig {
  std::string engine_type;
  // Only the union member matching engine_type is valid at any given time.
  union {
    PaddleEngineConfig* paddle_config;
    TritonEngineConfig* triton_config;
    TensorRTEngineConfig* tensorrt_config;
  };
  InferenceConfig() {
    paddle_config = nullptr;
  }
  explicit InferenceConfig(std::string engine_type) {
    this->engine_type = engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
    }
  }
  InferenceConfig(const InferenceConfig& config) {
    engine_type = config.engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
      *paddle_config = *(config.paddle_config);
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
      *triton_config = *(config.triton_config);
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
      *tensorrt_config = *(config.tensorrt_config);
    }
  }
  ~InferenceConfig() {
    if ("paddle" == engine_type) {
      delete paddle_config;
      paddle_config = nullptr;
    } else if ("triton" == engine_type) {
      delete triton_config;
      triton_config = nullptr;
    } else if ("tensorrt" == engine_type) {
      delete tensorrt_config;
      tensorrt_config = nullptr;
    }
  }
};
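// Usage sketch (illustrative, not part of the original header): constructing
// an InferenceConfig selects which union member is allocated, and only that
// member should be used afterwards.
inline void ExampleInferenceConfigUsage() {
  InferenceConfig config("paddle");      // allocates paddle_config
  config.paddle_config->use_gpu = true;
  config.paddle_config->gpu_id = 0;
  InferenceConfig copy = config;         // deep-copies the active member
  (void)copy;  // silence unused-variable warnings in this sketch
}  // destructors free the engine-specific configs here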
}  // namespace PaddleDeploy