option.h

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "ultra_infer/core/fd_type.h"
#include "ultra_infer/runtime/backends/tensorrt/option.h"

#include <iostream>
#include <memory>
#include <string>
#include <vector>

namespace ultra_infer {

/*! @brief Option object to configure GraphCore IPU
 */
struct IpuOption {
  /// IPU device id
  int ipu_device_num;
  /// Batch size in the graph; only takes effect when the graph has no batch
  /// shape info
  int ipu_micro_batch_size;
  /// Enable pipelining
  bool ipu_enable_pipelining;
  /// Number of batches per run in pipelining
  int ipu_batches_per_step;
  /// Enable fp16
  bool ipu_enable_fp16;
  /// Number of graph replications
  int ipu_replica_num;
  /// Available memory proportion for matmul/conv
  float ipu_available_memory_proportion;
  /// Enable fp16 partial for matmul; only takes effect with fp16
  bool ipu_enable_half_partial;
};
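
// A minimal configuration sketch (illustrative only; the field values below
// are assumptions, not recommended defaults):
//
//   ultra_infer::IpuOption ipu_opt;
//   ipu_opt.ipu_device_num = 1;                      // single IPU
//   ipu_opt.ipu_micro_batch_size = 1;                // used only when the graph has no batch shape info
//   ipu_opt.ipu_enable_pipelining = false;
//   ipu_opt.ipu_batches_per_step = 1;
//   ipu_opt.ipu_enable_fp16 = true;
//   ipu_opt.ipu_replica_num = 1;
//   ipu_opt.ipu_available_memory_proportion = 0.6f;  // share for matmul/conv
//   ipu_opt.ipu_enable_half_partial = true;          // only meaningful with fp16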

/*! @brief Option object to configure KUNLUNXIN XPU
 */
struct XpuOption {
  /// kunlunxin device id
  int kunlunxin_device_id = 0;
  /// The options below are set via EnableXpu
  /// kunlunxin_l3_workspace_size
  int kunlunxin_l3_workspace_size = 0xfffc00;
  /// kunlunxin_locked
  bool kunlunxin_locked = false;
  /// kunlunxin_autotune
  bool kunlunxin_autotune = true;
  /// kunlunxin_autotune_file
  std::string kunlunxin_autotune_file = "";
  /// kunlunxin_precision
  std::string kunlunxin_precision = "int16";
  /// kunlunxin_adaptive_seqlen
  bool kunlunxin_adaptive_seqlen = false;
  /// kunlunxin_enable_multi_stream
  bool kunlunxin_enable_multi_stream = false;
  /// The options below are set via SetXpuConfig
  /// quant post dynamic weight bits
  int kunlunxin_quant_post_dynamic_weight_bits = -1;
  /// quant post dynamic op types
  std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
};
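
// A minimal configuration sketch (illustrative only; the values below are
// assumptions, not tuned recommendations):
//
//   ultra_infer::XpuOption xpu_opt;
//   xpu_opt.kunlunxin_device_id = 0;                 // first XPU card
//   xpu_opt.kunlunxin_l3_workspace_size = 0xfffc00;  // L3 workspace size
//   xpu_opt.kunlunxin_precision = "int16";           // compute precision
//   xpu_opt.kunlunxin_autotune = true;
//   xpu_opt.kunlunxin_autotune_file = "";            // optional autotune cache path
//   xpu_opt.kunlunxin_quant_post_dynamic_weight_bits = -1;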

/*! @brief Option object to configure Paddle Inference backend
 */
struct PaddleBackendOption {
  /// Print log information while initializing the Paddle Inference backend
  bool enable_log_info = false;
  /// Enable MKLDNN while running inference on CPU
  bool enable_mkldnn = true;
  /// Use Paddle Inference + TensorRT to run the model on GPU
  bool enable_trt = false;
  /// Whether to enable memory optimization, default true
  bool enable_memory_optimize = true;
  /// Whether to enable IR debug, default false
  bool switch_ir_debug = false;
  /// Whether to enable IR optimization, default true
  bool switch_ir_optimize = true;
  /// Whether the loaded model is a quantized model
  bool is_quantize_model = false;
  /// Inference precision, default "float32"
  std::string inference_precision = "float32";
  /// Whether to enable cutlass kernels in inference
  bool enable_inference_cutlass = false;
  /*
   * @brief IPU option; configures the IPU hardware when the model runs on IPU
   */
  IpuOption ipu_option;
  /*
   * @brief XPU option; configures the KUNLUNXIN XPU hardware when the model
   * runs on XPU
   */
  XpuOption xpu_option;
  /// Whether to allow building the TensorRT engine at runtime when no tuned
  /// dynamic shape info is available, default true
  bool allow_build_trt_at_runtime = true;
  /// Collect input shapes for the model while enable_trt is true
  bool collect_trt_shape = false;
  /// Collect input shapes for the model by device (for some custom ops)
  bool collect_trt_shape_by_device = false;
  /// Cache input shapes for MKLDNN while the input data changes dynamically
  int mkldnn_cache_size = -1;
  /// Initial memory size (MB) allocated for GPU
  int gpu_mem_init_size = 100;
  /// Enable fixed-size optimization for transformer models
  bool enable_fixed_size_opt = false;
  /// min_subgraph_size for Paddle-TRT
  int trt_min_subgraph_size = 3;

#if PADDLEINFERENCE_VERSION_MAJOR == 2
  bool enable_new_ir = false;
#else
  bool enable_new_ir = true;
#endif

  /// Disable the given operator types from running on TensorRT
  void DisableTrtOps(const std::vector<std::string> &ops) {
    trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
  }

  /// Delete pass by name
  void DeletePass(const std::string &pass_name) {
    delete_pass_names.push_back(pass_name);
  }

  void SetIpuConfig(bool enable_fp16, int replica_num,
                    float available_memory_proportion,
                    bool enable_half_partial) {
    ipu_option.ipu_enable_fp16 = enable_fp16;
    ipu_option.ipu_replica_num = replica_num;
    ipu_option.ipu_available_memory_proportion = available_memory_proportion;
    ipu_option.ipu_enable_half_partial = enable_half_partial;
  }

  void SetXpuConfig(
      int quant_post_dynamic_weight_bits = -1,
      const std::vector<std::string> &quant_post_dynamic_op_types = {}) {
    xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
        quant_post_dynamic_weight_bits;
    xpu_option.kunlunxin_quant_post_dynamic_op_types =
        quant_post_dynamic_op_types;
  }

  // The following parameters may be removed in the future; please do not
  // read or write them directly.
  TrtBackendOption trt_option;
  bool enable_pinned_memory = false;
  void *external_stream_ = nullptr;
  Device device = Device::CPU;
  /// device id for CPU/GPU
  int device_id = 0;
  std::vector<std::string> trt_disabled_ops_{};
  int cpu_thread_num = 8;
  std::vector<std::string> delete_pass_names = {};
  std::string model_file = "";   // Path of model file
  std::string params_file = "";  // Path of parameters file, can be empty
  // Load model and parameters from memory
  bool model_from_memory_ = false;
};

}  // namespace ultra_infer
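
// A minimal configuration sketch for PaddleBackendOption (illustrative only;
// the values and op/pass names below are assumptions, not recommendations):
//
//   ultra_infer::PaddleBackendOption opt;
//   opt.enable_mkldnn = true;           // MKLDNN acceleration on CPU
//   opt.mkldnn_cache_size = 10;         // cache shapes for dynamic inputs
//   opt.enable_trt = true;              // Paddle Inference + TensorRT on GPU
//   opt.collect_trt_shape = true;       // collect shapes when enable_trt is true
//   opt.trt_min_subgraph_size = 3;
//   opt.DisableTrtOps({"concat"});      // hypothetical op type to keep off TensorRT
//   opt.DeletePass("example_pass");     // hypothetical IR pass name to drop
//   opt.SetIpuConfig(/*enable_fp16=*/true, /*replica_num=*/1,
//                    /*available_memory_proportion=*/0.6f,
//                    /*enable_half_partial=*/true);
//   opt.SetXpuConfig(/*quant_post_dynamic_weight_bits=*/8,
//                    /*quant_post_dynamic_op_types=*/{"conv2d"});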