engine_config.h

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <vector>

#include "yaml-cpp/yaml.h"
namespace PaddleDeploy {

struct PaddleEngineConfig {
  // Model file path
  std::string model_filename = "";
  // Model params file path
  std::string params_filename = "";
  // Whether to use the MKL-DNN acceleration library when deploying on CPU
  bool use_mkl = true;
  // Number of threads to use when MKL-DNN is enabled
  int mkl_thread_num = 8;
  // Whether to use GPU
  bool use_gpu = false;
  // GPU ID to run on, default is 0
  int gpu_id = 0;
  // Whether to enable IR optimization
  bool use_ir_optim = true;
  // Whether to use TensorRT
  bool use_trt = false;
  // Batch size
  int batch_size = 1;
  // TensorRT min_subgraph_size
  int min_subgraph_size = 1;
  /* TensorRT data precision
       0: FP32
       1: FP16
       2: Int8 */
  int precision = 0;
  // When TensorRT is used, whether to serialize the TensorRT engine to disk
  bool use_static = false;
  // Whether offline calibration is required when TensorRT is used
  bool use_calib_mode = false;
};
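
// Example (illustrative sketch, not part of the original header): a typical way
// to populate PaddleEngineConfig for GPU inference with TensorRT FP16. The model
// paths below are placeholders.
//
//   PaddleEngineConfig paddle_cfg;
//   paddle_cfg.model_filename = "model.pdmodel";     // placeholder path
//   paddle_cfg.params_filename = "model.pdiparams";  // placeholder path
//   paddle_cfg.use_gpu = true;
//   paddle_cfg.gpu_id = 0;
//   paddle_cfg.use_trt = true;
//   paddle_cfg.precision = 1;      // FP16
//   paddle_cfg.use_static = true;  // cache the serialized TensorRT engine on disk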
struct TritonEngineConfig {
  TritonEngineConfig() : model_name_(""), model_version_(""),
      request_id_(""), sequence_id_(0), sequence_start_(false),
      sequence_end_(false), priority_(0), server_timeout_(0),
      client_timeout_(0) {}
  /// The name of the model to run inference on.
  std::string model_name_;
  /// The version of the model to use while running inference. The default
  /// value is an empty string, which means the server will select the
  /// version of the model based on its internal policy.
  std::string model_version_;
  /// An identifier for the request. If specified, it will be returned
  /// in the response. The default value is an empty string, which means no
  /// request_id will be used.
  std::string request_id_;
  /// The unique identifier for the sequence being represented by the
  /// object. The default value is 0, which means that the request does not
  /// belong to a sequence.
  uint64_t sequence_id_;
  /// Indicates whether the request being added marks the start of the
  /// sequence. The default value is false. This argument is ignored if
  /// 'sequence_id' is 0.
  bool sequence_start_;
  /// Indicates whether the request being added marks the end of the
  /// sequence. The default value is false. This argument is ignored if
  /// 'sequence_id' is 0.
  bool sequence_end_;
  /// Indicates the priority of the request. A priority value of zero
  /// indicates that the default priority level should be used
  /// (i.e. the same behavior as not specifying the priority parameter).
  /// Lower values indicate higher priority levels: the highest priority
  /// level is indicated by setting the parameter to 1, the next highest
  /// is 2, and so on. If not provided, the server will handle the request
  /// using the default setting for the model.
  uint64_t priority_;
  /// The timeout value for the request, in microseconds. If the request
  /// cannot be completed within this time, the server can take a
  /// model-specific action such as terminating the request. If not
  /// provided, the server will handle the request using the default
  /// setting for the model.
  uint64_t server_timeout_;
  /// The maximum end-to-end time, in microseconds, the request is allowed
  /// to take. Note that the HTTP library only offers precision up to
  /// milliseconds. The client will abort the request when the specified
  /// time elapses, and the request will return an error with the message
  /// "Deadline Exceeded". The default value is 0, which means the client
  /// will wait for the response from the server. This option is not
  /// supported for streaming requests; instead, see the 'stream_timeout'
  /// argument in InferenceServerGrpcClient::StartStream().
  uint64_t client_timeout_;
  /// Whether to enable verbose client logging.
  bool verbose_ = false;
  /// The inference server URL.
  std::string url_;
};
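
// Example (illustrative sketch, not part of the original header): configuring a
// Triton client request. The server address and model name are placeholders.
//
//   TritonEngineConfig triton_cfg;
//   triton_cfg.url_ = "localhost:8001";             // placeholder server endpoint
//   triton_cfg.model_name_ = "resnet50";            // placeholder model name
//   triton_cfg.model_version_ = "1";
//   triton_cfg.client_timeout_ = 5 * 1000 * 1000;   // 5 seconds, in microseconds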
struct TensorRTEngineConfig {
  // ONNX model path
  std::string model_file_;
  // GPU workspace size in bytes (1 << 28 = 256 MB)
  int max_workspace_size_ = 1 << 28;
  // Maximum batch size
  int max_batch_size_ = 1;
  // GPU ID to run on
  int gpu_id_ = 0;
  // Whether to serialize the built engine to disk
  bool save_engine_ = false;
  // Path of the serialized engine cache file
  std::string trt_cache_file_ = "";
  // Input and output info
  YAML::Node yaml_config_;
};
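
// Example (illustrative sketch, not part of the original header): building a
// TensorRT engine config from an ONNX export. The file paths are placeholders,
// and caching behavior via save_engine_/trt_cache_file_ is assumed from the
// field names.
//
//   TensorRTEngineConfig trt_cfg;
//   trt_cfg.model_file_ = "model.onnx";        // placeholder path
//   trt_cfg.max_workspace_size_ = 1 << 30;     // 1 GB workspace
//   trt_cfg.save_engine_ = true;
//   trt_cfg.trt_cache_file_ = "model.trt";     // placeholder cache path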
struct InferenceConfig {
  // Backend selector: "paddle", "triton", or "tensorrt"
  std::string engine_type;
  // Only the pointer matching engine_type is ever allocated, so the
  // engine-specific configs share storage in a union.
  union {
    PaddleEngineConfig* paddle_config;
    TritonEngineConfig* triton_config;
    TensorRTEngineConfig* tensorrt_config;
  };

  InferenceConfig() {
    paddle_config = nullptr;
  }

  explicit InferenceConfig(std::string engine_type) {
    this->engine_type = engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
    }
  }

  InferenceConfig(const InferenceConfig& config) {
    engine_type = config.engine_type;
    if ("paddle" == engine_type) {
      paddle_config = new PaddleEngineConfig();
      *paddle_config = *(config.paddle_config);
    } else if ("triton" == engine_type) {
      triton_config = new TritonEngineConfig();
      *triton_config = *(config.triton_config);
    } else if ("tensorrt" == engine_type) {
      tensorrt_config = new TensorRTEngineConfig();
      *tensorrt_config = *(config.tensorrt_config);
    }
  }

  ~InferenceConfig() {
    if ("paddle" == engine_type) {
      delete paddle_config;
      paddle_config = nullptr;
    } else if ("triton" == engine_type) {
      delete triton_config;
      triton_config = nullptr;
    } else if ("tensorrt" == engine_type) {
      delete tensorrt_config;
      tensorrt_config = nullptr;
    }
  }
};
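
// Example (illustrative sketch, not part of the original header): selecting the
// Paddle backend through InferenceConfig and tweaking its engine-specific fields.
//
//   InferenceConfig config("paddle");
//   config.paddle_config->use_gpu = true;
//   config.paddle_config->use_trt = true;
//   config.paddle_config->precision = 1;  // FP16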
}  // namespace PaddleDeploy