- // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- /*! \file runtime_option.h
- \brief Options used to configure how a Runtime object is created and runs inference.
- */
- #pragma once
- #include "ultra_infer/benchmark/option.h"
- #include "ultra_infer/runtime/backends/lite/option.h"
- #include "ultra_infer/runtime/backends/openvino/option.h"
- #include "ultra_infer/runtime/backends/ort/option.h"
- #include "ultra_infer/runtime/backends/paddle/option.h"
- #include "ultra_infer/runtime/backends/poros/option.h"
- #include "ultra_infer/runtime/backends/rknpu2/option.h"
- #include "ultra_infer/runtime/backends/sophgo/option.h"
- #include "ultra_infer/runtime/backends/tensorrt/option.h"
- #include "ultra_infer/runtime/backends/tvm/option.h"
- #include "ultra_infer/runtime/enum_variables.h"
- #include <algorithm>
- #include <map>
- #include <vector>
- namespace ultra_infer {
- /*! @brief Option object used when creating a new Runtime object
- */
- struct ULTRAINFER_DECL RuntimeOption {
- /** \brief Set path of model file and parameter file
- *
- * \param[in] model_path Path of the model file, e.g. ResNet50/model.pdmodel
- * for a Paddle format model or ResNet50/model.onnx for an ONNX format model
- * \param[in] params_path Path of the parameter file; only used when the
- * model format is Paddle, e.g. ResNet50/model.pdiparams
- * \param[in] format Format of the loaded model
- */
- void SetModelPath(const std::string &model_path,
- const std::string &params_path = "",
- const ModelFormat &format = ModelFormat::PADDLE);
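- // A minimal usage sketch for SetModelPath (illustrative only; the file
- // names below are placeholders, not files shipped with the library):
- //
- //   ultra_infer::RuntimeOption option;
- //   option.SetModelPath("ResNet50/model.pdmodel",
- //                       "ResNet50/model.pdiparams",
- //                       ultra_infer::ModelFormat::PADDLE);
- //
- // For an ONNX model, pass an empty parameter path and the ONNX format
- // value from enum_variables.h instead.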
- /** \brief Specify the memory buffers of the model and parameters. Used when
- * the model and params are loaded directly from memory
- *
- * \param[in] model_buffer The string of model memory buffer
- * \param[in] params_buffer The string of parameters memory buffer
- * \param[in] format Format of the loaded model
- */
- void SetModelBuffer(const std::string &model_buffer,
- const std::string &params_buffer = "",
- const ModelFormat &format = ModelFormat::PADDLE);
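- // A sketch of loading a model from memory with SetModelBuffer (illustrative
- // only; the file names are placeholders and the reading code assumes
- // <fstream>/<sstream> are included):
- //
- //   std::ifstream model_fin("model.pdmodel", std::ios::binary);
- //   std::stringstream model_ss;
- //   model_ss << model_fin.rdbuf();
- //   std::ifstream params_fin("model.pdiparams", std::ios::binary);
- //   std::stringstream params_ss;
- //   params_ss << params_fin.rdbuf();
- //   ultra_infer::RuntimeOption option;
- //   option.SetModelBuffer(model_ss.str(), params_ss.str(),
- //                         ultra_infer::ModelFormat::PADDLE);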
- /// Use CPU for inference; the runtime runs on CPU by default
- void UseCpu();
- /// Use NVIDIA GPU for inference
- void UseGpu(int gpu_id = 0);
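- // Device selection sketch (illustrative): the runtime already defaults to
- // CPU, so UseCpu() is mainly useful to switch back after another device
- // was chosen.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.UseGpu(0);  // run on the NVIDIA GPU with id 0
- //   option.UseCpu();   // or stay on CPU explicitly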
- /// Use RKNPU2 (e.g. RK3588/RK356X) for inference
- void UseRKNPU2(ultra_infer::rknpu2::CpuName rknpu2_name =
- ultra_infer::rknpu2::CpuName::RK356X,
- ultra_infer::rknpu2::CoreMask rknpu2_core =
- ultra_infer::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO);
- /// Use Horizon NPU for inference
- void UseHorizon();
- /// Use TimVX (e.g. RV1126/A311D) for inference
- void UseTimVX();
- /// Use Huawei Ascend for inference
- void UseAscend(int npu_id);
- /// Use ONNX Runtime DirectML for inference
- void UseDirectML();
- /// Use Sophgo for inference
- void UseSophgo();
- /// \brief Turn on KunlunXin XPU.
- ///
- /// \param kunlunxin_id the KunlunXin XPU card to use (default is 0).
- /// \param l3_workspace_size The size of device memory allocated for the L3
- /// cache; the maximum is 16 MB.
- /// \param locked Whether the allocated L3 cache can be locked. If false,
- /// it means that the L3 cache is not locked, and the allocated L3
- /// cache can be shared by multiple models, and multiple models
- /// sharing the L3 cache will be executed sequentially on the card.
- /// \param autotune Whether to autotune the conv operator in the model. If
- /// true, when the conv operator of a certain dimension is executed
- /// for the first time, it will automatically search for a better
- /// algorithm to improve the performance of subsequent conv operators
- /// of the same dimension.
- /// \param autotune_file Specify the path of the autotune file. If
- /// autotune_file is specified, the algorithm specified in the
- /// file will be used and autotune will not be performed again.
- /// \param precision Calculation precision of multi_encoder
- /// \param adaptive_seqlen Whether the input of multi_encoder is variable length
- /// \param enable_multi_stream Whether to enable the multi stream of
- /// KunlunXin XPU.
- /// \param gm_default_size The default size of global memory of KunlunXin XPU.
- ///
- void UseKunlunXin(int kunlunxin_id = 0, int l3_workspace_size = 0xfffc00,
- bool locked = false, bool autotune = true,
- const std::string &autotune_file = "",
- const std::string &precision = "int16",
- bool adaptive_seqlen = false,
- bool enable_multi_stream = false,
- int64_t gm_default_size = 0);
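- // KunlunXin usage sketch (illustrative): only the card id and precision are
- // set explicitly here; the remaining arguments keep the defaults shown in
- // the declaration above.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.UseKunlunXin(0 /* kunlunxin_id */, 0xfffc00 /* l3_workspace_size */,
- //                       false /* locked */, true /* autotune */,
- //                       "" /* autotune_file */, "int16" /* precision */);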
- void SetExternalStream(void *external_stream);
- /** \brief Set the number of CPU threads used while running inference on CPU.
- * By default the value is decided by the selected backend.
- */
- void SetCpuThreadNum(int thread_num);
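- // Thread-count sketch (illustrative): the default of -1 leaves the choice
- // to the selected backend.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.UseCpu();
- //   option.SetCpuThreadNum(8);  // run CPU inference with 8 threads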
- /// Set Paddle Inference as inference backend, supports CPU/GPU
- void UsePaddleInferBackend() { return UsePaddleBackend(); }
- /// Set ONNX Runtime as inference backend, supports CPU/GPU
- void UseOrtBackend();
- /// Set SOPHGO Runtime as inference backend, supports SOPHGO devices
- void UseSophgoBackend();
- /// Set TensorRT as inference backend, supports GPU only
- void UseTrtBackend();
- /// Set Poros as inference backend, supports CPU/GPU
- void UsePorosBackend();
- /// Set OpenVINO as inference backend, supports CPU only
- void UseOpenVINOBackend();
- /// Set Paddle Lite as inference backend, supports Arm CPU only
- void UsePaddleLiteBackend() { return UseLiteBackend(); }
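- // Backend selection sketch (illustrative): the backend must match the
- // chosen device, e.g. TensorRT requires a GPU while OpenVINO runs on CPU
- // only, as the comments above note.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.UseGpu(0);
- //   option.UseTrtBackend();         // TensorRT on GPU
- //   // or:
- //   // option.UseCpu();
- //   // option.UseOpenVINOBackend(); // OpenVINO on CPU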
- /** \brief Use Graphcore IPU for inference.
- *
- * \param[in] device_num the number of IPUs.
- * \param[in] micro_batch_size the batch size in the graph; only works when
- * the graph has no batch shape info.
- * \param[in] enable_pipelining enable pipelining.
- * \param[in] batches_per_step the number of batches per run in pipelining.
- */
- void UseIpu(int device_num = 1, int micro_batch_size = 1,
- bool enable_pipelining = false, int batches_per_step = 1);
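- // Graphcore IPU sketch (illustrative): pipelining stays disabled, so
- // batches_per_step keeps its default of 1.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.UseIpu(1 /* device_num */, 1 /* micro_batch_size */);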
- /// Option to configure ONNX Runtime backend
- OrtBackendOption ort_option;
- /// Option to configure TensorRT backend
- TrtBackendOption trt_option;
- /// Option to configure Paddle Inference backend
- PaddleBackendOption paddle_infer_option;
- /// Option to configure Poros backend
- PorosBackendOption poros_option;
- /// Option to configure OpenVINO backend
- OpenVINOBackendOption openvino_option;
- /// Option to configure Paddle Lite backend
- LiteBackendOption paddle_lite_option;
- /// Option to configure RKNPU2 backend
- RKNPU2BackendOption rknpu2_option;
- /// Option to configure TVM backend
- TVMBackendOption tvm_option;
- // \brief Enable the profiling mode.
- //
- // \param[in] include_h2d_d2h Whether to include H2D/D2H transfer
- // time in the measured runtime time.
- // \param[in] repeat Repeat times for runtime inference.
- // \param[in] warmup Warmup times for runtime inference.
- //
- void EnableProfiling(bool include_h2d_d2h = false, int repeat = 100,
- int warmup = 50) {
- benchmark_option.enable_profile = true;
- benchmark_option.warmup = warmup;
- benchmark_option.repeats = repeat;
- benchmark_option.include_h2d_d2h = include_h2d_d2h;
- }
- // \brief Disable the profiling mode.
- //
- void DisableProfiling() { benchmark_option.enable_profile = false; }
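- // Profiling sketch (illustrative): 50 warmup runs and 100 measured runs,
- // excluding host/device transfer time, then profiling is turned off again.
- //
- //   ultra_infer::RuntimeOption option;
- //   option.EnableProfiling(false /* include_h2d_d2h */, 100 /* repeat */,
- //                          50 /* warmup */);
- //   // ... run inference while profiling is enabled ...
- //   option.DisableProfiling();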
- // \brief Enable checking whether the backend set by the
- // user can be found in valid_xxx_backend.
- //
- void EnableValidBackendCheck() { enable_valid_backend_check = true; }
- // \brief Disable checking whether the backend set by the
- // user can be found in valid_xxx_backend.
- //
- void DisableValidBackendCheck() { enable_valid_backend_check = false; }
- // Benchmark option
- benchmark::BenchmarkOption benchmark_option;
- // enable the check for valid backend, default true.
- bool enable_valid_backend_check = true;
- // If model_from_memory_ is true, model_file and params_file hold
- // binary streams in memory;
- // otherwise, model_file and params_file are file paths
- std::string model_file = "";
- std::string params_file = "";
- bool model_from_memory_ = false;
- // format of input model
- ModelFormat model_format = ModelFormat::PADDLE;
- // for cpu inference
- // the default (-1) lets each backend choose its own value
- int cpu_thread_num = -1;
- int device_id = 0;
- Backend backend = Backend::UNKNOWN;
- Device device = Device::CPU;
- void *external_stream_ = nullptr;
- bool enable_pinned_memory = false;
- // *** The following APIs are deprecated and will be removed in v1.2.0
- // *** Do not use them anymore
- void SetPaddleMKLDNN(bool pd_mkldnn = true);
- void EnablePaddleToTrt();
- void DeletePaddleBackendPass(const std::string &delete_pass_name);
- void EnablePaddleLogInfo();
- void DisablePaddleLogInfo();
- void SetPaddleMKLDNNCacheSize(int size);
- void SetOpenVINODevice(const std::string &name = "CPU");
- void SetOpenVINOShapeInfo(
- const std::map<std::string, std::vector<int64_t>> &shape_info) {
- openvino_option.shape_infos = shape_info;
- }
- void SetOpenVINOCpuOperators(const std::vector<std::string> &operators) {
- openvino_option.SetCpuOperators(operators);
- }
- void SetLiteOptimizedModelDir(const std::string &optimized_model_dir);
- void SetLiteSubgraphPartitionPath(
- const std::string &nnadapter_subgraph_partition_config_path);
- void SetLiteSubgraphPartitionConfigBuffer(
- const std::string &nnadapter_subgraph_partition_config_buffer);
- void
- SetLiteContextProperties(const std::string &nnadapter_context_properties);
- void SetLiteModelCacheDir(const std::string &nnadapter_model_cache_dir);
- void SetLiteDynamicShapeInfo(
- const std::map<std::string, std::vector<std::vector<int64_t>>>
- &nnadapter_dynamic_shape_info);
- void SetLiteMixedPrecisionQuantizationConfigPath(
- const std::string &nnadapter_mixed_precision_quantization_config_path);
- void EnableLiteFP16();
- void DisableLiteFP16();
- void EnableLiteInt8();
- void DisableLiteInt8();
- void SetLitePowerMode(LitePowerMode mode);
- void SetTrtInputShape(
- const std::string &input_name, const std::vector<int32_t> &min_shape,
- const std::vector<int32_t> &opt_shape = std::vector<int32_t>(),
- const std::vector<int32_t> &max_shape = std::vector<int32_t>());
- void SetTrtInputData(
- const std::string &input_name, const std::vector<float> &min_shape_data,
- const std::vector<float> &opt_shape_data = std::vector<float>(),
- const std::vector<float> &max_shape_data = std::vector<float>());
- void SetTrtMaxWorkspaceSize(size_t trt_max_workspace_size);
- void SetTrtMaxBatchSize(size_t max_batch_size);
- void EnableTrtFP16();
- void DisableTrtFP16();
- void SetTrtCacheFile(const std::string &cache_file_path);
- void EnablePinnedMemory();
- void DisablePinnedMemory();
- void EnablePaddleTrtCollectShape();
- void DisablePaddleTrtCollectShape();
- void DisablePaddleTrtOPs(const std::vector<std::string> &ops);
- void SetOpenVINOStreams(int num_streams);
- void SetOrtGraphOptLevel(int level = -1);
- void UsePaddleBackend();
- void UseLiteBackend();
- void UseHorizonNPUBackend();
- void UseTVMBackend();
- void UseOMBackend();
- };
- } // namespace ultra_infer