- // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- #pragma once
- #include "ultra_infer/core/fd_type.h"
- #include "ultra_infer/runtime/backends/tensorrt/option.h"
- #include <iostream>
- #include <memory>
- #include <string>
- #include <vector>
- namespace ultra_infer {
- /*! @brief Option object to configure Graphcore IPU
- */
- struct IpuOption {
- /// the number of IPUs to use
- int ipu_device_num;
- /// the batch size used in the graph; only takes effect when the graph has no batch-size info
- int ipu_micro_batch_size;
- /// enable pipelining
- bool ipu_enable_pipelining;
- /// the number of batches per run in pipelining
- int ipu_batches_per_step;
- /// enable fp16
- bool ipu_enable_fp16;
- /// the number of graph replicas
- int ipu_replica_num;
- /// the available memory proportion for matmul/conv
- float ipu_available_memory_proportion;
- /// enable fp16 partials for matmul; only takes effect when fp16 is enabled
- bool ipu_enable_half_partial;
- };
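- // A minimal usage sketch (illustrative only; the values below are
- // assumptions for the example, not defaults of this struct):
- //
- //   ultra_infer::IpuOption opt;
- //   opt.ipu_device_num = 1;                      // run on a single IPU
- //   opt.ipu_micro_batch_size = 1;                // graph lacks batch info
- //   opt.ipu_enable_pipelining = false;
- //   opt.ipu_batches_per_step = 1;
- //   opt.ipu_enable_fp16 = true;
- //   opt.ipu_replica_num = 1;
- //   opt.ipu_available_memory_proportion = 0.6f;  // for matmul/conv
- //   opt.ipu_enable_half_partial = true;          // requires fp16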
- /*! @brief Option object to configure Kunlunxin XPU
- */
- struct XpuOption {
- /// kunlunxin device id
- int kunlunxin_device_id = 0;
- // Parameters below are configured through EnableXpu
- /// size of the L3 cache workspace in bytes (default 0xfffc00, about 16 MB)
- int kunlunxin_l3_workspace_size = 0xfffc00;
- /// whether the allocated L3 cache can be locked; if not locked, it can be shared by multiple models
- bool kunlunxin_locked = false;
- /// whether to autotune conv operators; when enabled, a better algorithm is searched on first execution
- bool kunlunxin_autotune = true;
- /// path of the autotune file; if set, the algorithms recorded in the file are used and autotune is skipped
- std::string kunlunxin_autotune_file = "";
- /// computation precision of the multi_encoder op, e.g. "int16"
- std::string kunlunxin_precision = "int16";
- /// whether the multi_encoder input is variable length
- bool kunlunxin_adaptive_seqlen = false;
- /// whether to enable multi-stream execution
- bool kunlunxin_enable_multi_stream = false;
- // Parameters below are configured through SetXpuConfig
- /// weight bit width for post-training dynamic quantization; -1 means use the default
- int kunlunxin_quant_post_dynamic_weight_bits = -1;
- /// op types to apply post-training dynamic quantization to
- std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
- };
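- // A minimal usage sketch (illustrative only; non-default values are
- // assumptions for the example):
- //
- //   ultra_infer::XpuOption xpu;
- //   xpu.kunlunxin_device_id = 0;
- //   xpu.kunlunxin_l3_workspace_size = 0xfffc00;  // ~16 MB L3 workspace
- //   xpu.kunlunxin_autotune = true;
- //   xpu.kunlunxin_precision = "int16";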
- /*! @brief Option object to configure Paddle Inference backend
- */
- struct PaddleBackendOption {
- /// Print log information while initializing the Paddle Inference backend
- bool enable_log_info = false;
- /// Enable MKLDNN (oneDNN) when running inference on CPU
- bool enable_mkldnn = true;
- /// Use Paddle Inference + TensorRT to run the model on GPU
- bool enable_trt = false;
- /// Whether to enable memory optimization, default true
- bool enable_memory_optimize = true;
- /// Whether to enable IR debugging, default false
- bool switch_ir_debug = false;
- /// Whether to enable IR optimization, default true
- bool switch_ir_optimize = true;
- /// Whether the loaded model is a quantized model
- bool is_quantize_model = false;
- /// Inference precision, e.g. "float32" or "float16"
- std::string inference_precision = "float32";
- /// Whether to enable CUTLASS-based kernels during inference
- bool enable_inference_cutlass = false;
- /*! @brief IPU option; configures the IPU hardware when running inference
-  * on IPU
-  */
- IpuOption ipu_option;
- /*! @brief XPU option; configures the Kunlunxin XPU hardware when running
-  * inference on XPU
-  */
- XpuOption xpu_option;
- /// Whether to allow building the TensorRT engine at runtime when tuned dynamic shape info is not available, default true
- bool allow_build_trt_at_runtime = true;
- /// Collect shape info for the model when enable_trt is true
- bool collect_trt_shape = false;
- /// Collect shape info on device (needed by some custom ops)
- bool collect_trt_shape_by_device = false;
- /// Cache input shapes for MKLDNN when the input shape changes dynamically
- int mkldnn_cache_size = -1;
- /// Initial GPU memory pool size (MB)
- int gpu_mem_init_size = 100;
- /// Enable fixed-size optimization for transformer models
- bool enable_fixed_size_opt = false;
- /// Minimum subgraph size for Paddle-TRT
- int trt_min_subgraph_size = 3;
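- // A minimal sketch of a Paddle-TRT setup for dynamic-shape models
- // (illustrative only):
- //
- //   ultra_infer::PaddleBackendOption opt;
- //   opt.enable_trt = true;          // run supported subgraphs on TensorRT
- //   opt.collect_trt_shape = true;   // gather shape ranges before building
- //   opt.trt_min_subgraph_size = 3;  // leave tiny subgraphs to Paddle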
- // The new IR is disabled by default on Paddle Inference 2.x and enabled otherwise
- #if PADDLEINFERENCE_VERSION_MAJOR == 2
- bool enable_new_ir = false;
- #else
- bool enable_new_ir = true;
- #endif
- /// Disable the given operator types from running on TensorRT
- void DisableTrtOps(const std::vector<std::string> &ops) {
- trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
- }
- /// Delete pass by name
- void DeletePass(const std::string &pass_name) {
- delete_pass_names.push_back(pass_name);
- }
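- // Illustrative usage (the op and pass names are examples, not
- // recommendations):
- //
- //   ultra_infer::PaddleBackendOption opt;
- //   opt.DisableTrtOps({"multiclass_nms3"});  // keep this op off TensorRT
- //   opt.DeletePass("conv_bn_fuse_pass");     // drop one fusion pass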
- /// Set the IPU hardware configuration
- void SetIpuConfig(bool enable_fp16, int replica_num,
- float available_memory_proportion,
- bool enable_half_partial) {
- ipu_option.ipu_enable_fp16 = enable_fp16;
- ipu_option.ipu_replica_num = replica_num;
- ipu_option.ipu_available_memory_proportion = available_memory_proportion;
- ipu_option.ipu_enable_half_partial = enable_half_partial;
- }
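- // Illustrative call (the argument values are assumptions):
- //
- //   ultra_infer::PaddleBackendOption opt;
- //   opt.SetIpuConfig(/*enable_fp16=*/true, /*replica_num=*/1,
- //                    /*available_memory_proportion=*/0.6f,
- //                    /*enable_half_partial=*/true);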
- /// Set the XPU post-training quantization configuration
- void SetXpuConfig(
- int quant_post_dynamic_weight_bits = -1,
- const std::vector<std::string> &quant_post_dynamic_op_types = {}) {
- xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
- quant_post_dynamic_weight_bits;
- xpu_option.kunlunxin_quant_post_dynamic_op_types =
- quant_post_dynamic_op_types;
- }
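- // Illustrative call (the bit width and op list are assumptions):
- //
- //   ultra_infer::PaddleBackendOption opt;
- //   opt.SetXpuConfig(/*quant_post_dynamic_weight_bits=*/8,
- //                    {"conv2d", "matmul_v2"});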
- // The following parameters may be removed in the future; please do not
- // read or write them directly
- TrtBackendOption trt_option;
- bool enable_pinned_memory = false;
- void *external_stream_ = nullptr;
- Device device = Device::CPU;
- /// device id for CPU/GPU
- int device_id = 0;
- std::vector<std::string> trt_disabled_ops_{};
- int cpu_thread_num = 8;
- std::vector<std::string> delete_pass_names = {};
- std::string model_file = ""; // Path of model file
- std::string params_file = ""; // Path of parameters file, can be empty
- // Whether to load the model and parameters from memory
- bool model_from_memory_ = false;
- };
- } // namespace ultra_infer
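- // End-to-end sketch (illustrative; Device::GPU is assumed to be defined in
- // fd_type.h alongside the Device::CPU default used above):
- //
- //   ultra_infer::PaddleBackendOption opt;
- //   opt.device = ultra_infer::Device::GPU;
- //   opt.device_id = 0;
- //   opt.enable_trt = true;        // Paddle Inference + TensorRT on GPU
- //   opt.gpu_mem_init_size = 512;  // initial GPU memory pool, MB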