zhang-prog, 8 months ago
parent
commit
3c9400e6dd
40 changed files with 1120 additions and 559 deletions
  1. +173 -52 docs/module_usage/instructions/benchmark.en.md
  2. +90 -86 docs/module_usage/instructions/benchmark.md
  3. +3 -24 paddlex/inference/common/batch_sampler/image_batch_sampler.py
  4. +0 -20 paddlex/inference/common/batch_sampler/video_batch_sampler.py
  5. +3 -0 paddlex/inference/common/reader/det_3d_reader.py
  6. +1 -1 paddlex/inference/common/reader/image_reader.py
  7. +2 -0 paddlex/inference/common/reader/ts_reader.py
  8. +2 -0 paddlex/inference/common/reader/video_reader.py
  9. +8 -8 paddlex/inference/models/3d_bev_detection/processors.py
  10. +1 -1 paddlex/inference/models/anomaly_detection/processors.py
  11. +10 -1 paddlex/inference/models/base/predictor/base_predictor.py
  12. +16 -12 paddlex/inference/models/base/predictor/basic_predictor.py
  13. +350 -151 paddlex/inference/models/common/static_infer.py
  14. +6 -6 paddlex/inference/models/common/ts/processors.py
  15. +6 -6 paddlex/inference/models/common/vision/processors.py
  16. +10 -10 paddlex/inference/models/formula_recognition/processors.py
  17. +2 -2 paddlex/inference/models/image_classification/processors.py
  18. +1 -1 paddlex/inference/models/image_feature/processors.py
  19. +1 -1 paddlex/inference/models/image_multilabel_classification/processors.py
  20. +1 -1 paddlex/inference/models/image_unwarping/processors.py
  21. +1 -1 paddlex/inference/models/instance_segmentation/processors.py
  22. +2 -2 paddlex/inference/models/keypoint_detection/processors.py
  23. +9 -9 paddlex/inference/models/object_detection/processors.py
  24. +4 -4 paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py
  25. +2 -2 paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py
  26. +2 -2 paddlex/inference/models/semantic_segmentation/processors.py
  27. +2 -2 paddlex/inference/models/table_structure_recognition/processors.py
  28. +3 -3 paddlex/inference/models/text_detection/processors.py
  29. +4 -4 paddlex/inference/models/text_recognition/processors.py
  30. +1 -1 paddlex/inference/models/ts_anomaly_detection/processors.py
  31. +2 -2 paddlex/inference/models/ts_classification/processors.py
  32. +2 -2 paddlex/inference/models/ts_forecasting/processors.py
  33. +6 -6 paddlex/inference/models/video_classification/processors.py
  34. +4 -4 paddlex/inference/models/video_detection/processors.py
  35. +109 -37 paddlex/inference/utils/benchmark.py
  36. +82 -0 paddlex/inference/utils/hpi.py
  37. +177 -83 paddlex/inference/utils/pp_option.py
  38. +7 -3 paddlex/model.py
  39. +8 -4 paddlex/utils/device.py
  40. +7 -5 paddlex/utils/flags.py

+ 173 - 52
docs/module_usage/instructions/benchmark.en.md

@@ -1,78 +1,199 @@
 # Model Inference Benchmark
 
-PaddleX support to benchmark model inference. Just set the related flags:
+## Table of Contents
 
-* `PADDLE_PDX_INFER_BENCHMARK`: `True` means enable benchmark. `False` by default;
-* `PADDLE_PDX_INFER_BENCHMARK_WARMUP`: Number of warmup. Using random data to infer before testing benchmark if `input` is set to `None`. `0` by default;
-* `PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE`: The size of randomly generated data. Valid only when `input` is set to `None`. `224` by default;
-* `PADDLE_PDX_INFER_BENCHMARK_ITER`: Number of benchmark testing using random data. Valid only when `input` is set to `None`. `10` by default;
-* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`: The directory to save benchmark result. `None` by default, that means not save.
+- [1. Instructions](#1.-Instructions)
+- [2. Usage Examples](#2.-Usage-Examples)
+  - [2.1 Command Line Method](#2.1-Command-Line-Method)
+  - [2.2 Python Script Method](#2.2-Python-Script-Method)
+- [3. Explanation of Results](#3.-Explanation-of-Results)
 
-The example is as follows:
+## 1. Instructions
+
+The benchmark feature collects the average execution time per iteration for each operation in the end-to-end model inference process as well as the average execution time per instance, and provides summary information. The time measurements are in milliseconds.
+
+To enable the benchmark feature, you must set the following environment variables:
+
+* `PADDLE_PDX_INFER_BENCHMARK`: When set to `True`, the benchmark feature is enabled (default is `False`);
+* `PADDLE_PDX_INFER_BENCHMARK_WARMUP`: The number of warm-up iterations before testing (default is `0`);
+* `PADDLE_PDX_INFER_BENCHMARK_ITERS`: The number of iterations for testing (default is `0`);
+* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR`: The directory where the metrics are saved (e.g., `./benchmark`). The default is `None`, meaning the benchmark metrics will not be saved.
+
+**Note**:
+
+* At least one of `PADDLE_PDX_INFER_BENCHMARK_WARMUP` or `PADDLE_PDX_INFER_BENCHMARK_ITERS` must be set to a value greater than zero; otherwise, the benchmark feature cannot be used.
+* The benchmark feature does not currently apply to model pipelines.
+
+## 2. Usage Examples
+
+You can use the benchmark feature by either the command line method or the Python script method.
+
+### 2.1 Command Line Method
+
+**Note**:
+
+- For a description of the input parameters, please refer to the [PaddleX Common Model Configuration File Parameter Explanation](./config_parameters_common.en.md).
+- If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match the size of `batch_size`.
+
+Execute the command:
 
 ```bash
 PADDLE_PDX_INFER_BENCHMARK=True \
 PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \
-PADDLE_PDX_INFER_BENCHMARK_DATA_SIZE=320 \
-PADDLE_PDX_INFER_BENCHMARK_ITER=10 \
-PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark \
+PADDLE_PDX_INFER_BENCHMARK_ITERS=10 \
+PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark \
 python main.py \
-    -c ./paddlex/configs/object_detection/PicoDet-XS.yaml \
+    -c ./paddlex/configs/modules/object_detection/PicoDet-XS.yaml \
     -o Global.mode=predict \
     -o Predict.model_dir=None \
     -o Predict.batch_size=2 \
-    -o Predict.input=None
+    -o Predict.input=./test.png
 ```
 
-The benchmark infomation would be print:
+### 2.2 Python Script Method
+
+**Note**:
+
+- For a description of the input parameters, please refer to the [PaddleX Single Model Python Usage Instructions](./model_python_API.en.md).
+- If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match the size of `batch_size`.
+
+Create the script `test_infer.py`:
 
+```python
+from paddlex import create_model
+
+model = create_model(model_name="PicoDet-XS", model_dir=None)
+output = list(model.predict(input="./test.png", batch_size=2))
 ```
-+----------------+-----------------+-----------------+------------------------+
-|   Component    | Total Time (ms) | Number of Calls | Avg Time Per Call (ms) |
-+----------------+-----------------+-----------------+------------------------+
-|    ReadCmp     |   99.60412979   |        10       |       9.96041298       |
-|     Resize     |   17.01641083   |        20       |       0.85082054       |
-|   Normalize    |   44.61312294   |        20       |       2.23065615       |
-|   ToCHWImage   |    0.03385544   |        20       |       0.00169277       |
-|    Copy2GPU    |   13.46874237   |        10       |       1.34687424       |
-|     Infer      |   71.31743431   |        10       |       7.13174343       |
-|    Copy2CPU    |    0.39076805   |        10       |       0.03907681       |
-| DetPostProcess |    0.36168098   |        20       |       0.01808405       |
-+----------------+-----------------+-----------------+------------------------+
-+-------------+-----------------+---------------------+----------------------------+
-|    Stage    | Total Time (ms) | Number of Instances | Avg Time Per Instance (ms) |
-+-------------+-----------------+---------------------+----------------------------+
-|  PreProcess |   161.26751900  |          20         |         8.06337595         |
-|  Inference  |   85.17694473   |          20         |         4.25884724         |
-| PostProcess |    0.36168098   |          20         |         0.01808405         |
-|   End2End   |   256.90770149  |          20         |        12.84538507         |
-|    WarmUp   |  5412.37807274  |          10         |        541.23780727        |
-+-------------+-----------------+---------------------+----------------------------+
+
+Run the script:
+
+```bash
+PADDLE_PDX_INFER_BENCHMARK=True \
+PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \
+PADDLE_PDX_INFER_BENCHMARK_ITERS=10 \
+PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark \
+python test_infer.py
 ```
 
-The first table show the benchmark infomation by each component(`Component`), include `Total Time` (unit is "ms"), `Number of Calls` and `Avg Time Per Call`  (unit is "ms"). `Avg Time Per Call` is `Total Time` devide by `Number of Calls`. It should be noted that the `Number of Calls` is the number of times the component has been called.
+## 3. Explanation of Results
+
+After enabling the benchmark feature, the benchmark results will be automatically printed. The details are as follows:
 
-And the second table show the benchmark infomation by different stages: `WarmUp`, `PreProcess`, `Inference`, `PostProcess` and `End2End`. Different from the first table, `Number of Instances` is the number of instances (samples), not the number of calls.
+<table border="1">
+    <thead>
+        <tr>
+            <th>Field Name</th>
+            <th>Field Description</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>Iters</td>
+            <td>Number of iterations, i.e., the number of times inference is executed in a loop.</td>
+        </tr>
+        <tr>
+            <td>Batch Size</td>
+            <td>Batch size, i.e., the number of instances processed in each iteration.</td>
+        </tr>
+        <tr>
+            <td>Instances</td>
+            <td>Total number of instances, calculated as <code>Iters</code> multiplied by <code>Batch Size</code>.</td>
+        </tr>
+        <tr>
+            <td>Operation</td>
+            <td>Name of the operation, such as <code>Resize</code>, <code>Normalize</code>, etc.</td>
+        </tr>
+        <tr>
+            <td>Type</td>
+            <td>Type of time consumption, including:
+            <ul>
+            <li>preprocessing time (<code>Preprocessing</code>)</li>
+            <li>model inference time (<code>Inference</code>)</li>
+            <li>postprocessing time (<code>Postprocessing</code>)</li>
+            <li>core time (<code>Core</code>, i.e., Preprocessing + Inference + Postprocessing)</li>
+            <li>other time (<code>Other</code>)</li>
+            <li>end-to-end time (<code>End-to-End</code>, i.e., Core + Other)</li>
+            </ul>
+            </td>
+        </tr>
+        <tr>
+            <td>Avg Time Per Iter (ms)</td>
+            <td>Average execution time per iteration, in milliseconds.</td>
+        </tr>
+        <tr>
+            <td>Avg Time Per Instance (ms)</td>
+            <td>Average execution time per instance, in milliseconds.</td>
+        </tr>
+    </tbody>
+</table>
 
-Meanwhile, the benchmark infomation would be saved to local files (`detail.csv` and `summary.csv`) if you set `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`:
+Below is an example of the benchmark results obtained by running the example program in Section 2:
+
+```
+                                               WarmUp Data
++-------+------------+-----------+----------------+------------------------+----------------------------+
+| Iters | Batch Size | Instances |      Type      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+|   5   |     2      |     10    | Preprocessing  |      97.89338876       |        48.94669438         |
+|   5   |     2      |     10    |   Inference    |      66.70711380       |        33.35355690         |
+|   5   |     2      |     10    | Postprocessing |       0.20138482       |         0.10069241         |
+|   5   |     2      |     10    |      Core      |      164.80188738      |        82.40094369         |
+|   5   |     2      |     10    |     Other      |       3.41097047       |         1.70548523         |
+|   5   |     2      |     10    |   End-to-End   |      168.21285784      |        84.10642892         |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+                                                 Detail Data
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+| Iters | Batch Size | Instances |     Operation      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+|   10  |     2      |     20    |     ReadImage      |      76.22221033       |        38.11110517         |
+|   10  |     2      |     20    |       Resize       |      12.02824502       |         6.01412251         |
+|   10  |     2      |     20    |     Normalize      |       6.14072606       |         3.07036303         |
+|   10  |     2      |     20    |     ToCHWImage     |       0.00533939       |         0.00266969         |
+|   10  |     2      |     20    |      ToBatch       |       0.93134162       |         0.46567081         |
+|   10  |     2      |     20    | PaddleCopyToDevice |       0.92240779       |         0.46120390         |
+|   10  |     2      |     20    |  PaddleModelInfer  |       9.66330138       |         4.83165069         |
+|   10  |     2      |     20    |  PaddleCopyToHost  |       0.06802108       |         0.03401054         |
+|   10  |     2      |     20    |   DetPostProcess   |       0.18665448       |         0.09332724         |
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+                                               Summary Data
++-------+------------+-----------+----------------+------------------------+----------------------------+
+| Iters | Batch Size | Instances |      Type      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+|   10  |     2      |     20    | Preprocessing  |      95.32786242       |        47.66393121         |
+|   10  |     2      |     20    |   Inference    |      10.65373025       |         5.32686512         |
+|   10  |     2      |     20    | Postprocessing |       0.18665448       |         0.09332724         |
+|   10  |     2      |     20    |      Core      |      106.16824715      |        53.08412358         |
+|   10  |     2      |     20    |     Other      |       2.74794563       |         1.37397281         |
+|   10  |     2      |     20    |   End-to-End   |      108.91619278      |        54.45809639         |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+```
+
+Additionally, since `PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark` is set, the above results will be saved locally in `./benchmark/detail.csv` and `./benchmark/summary.csv`.
+
+The contents of `detail.csv` are as follows:
 
 ```csv
-Component,Total Time (ms),Number of Calls,Avg Time Per Call (ms)
-ReadCmp,99.60412979125977,10,9.960412979125977
-Resize,17.01641082763672,20,0.8508205413818359
-Normalize,44.61312294006348,20,2.230656147003174
-ToCHWImage,0.033855438232421875,20,0.0016927719116210938
-Copy2GPU,13.468742370605469,10,1.3468742370605469
-Infer,71.31743431091309,10,7.131743431091309
-Copy2CPU,0.39076805114746094,10,0.039076805114746094
-DetPostProcess,0.3616809844970703,20,0.018084049224853516
+Iters,Batch Size,Instances,Operation,Avg Time Per Iter (ms),Avg Time Per Instance (ms)
+10,2,20,ReadImage,76.22221033,38.11110517
+10,2,20,Resize,12.02824502,6.01412251
+10,2,20,Normalize,6.14072606,3.07036303
+10,2,20,ToCHWImage,0.00533939,0.00266969
+10,2,20,ToBatch,0.93134162,0.46567081
+10,2,20,PaddleCopyToDevice,0.92240779,0.46120390
+10,2,20,PaddleModelInfer,9.66330138,4.83165069
+10,2,20,PaddleCopyToHost,0.06802108,0.03401054
+10,2,20,DetPostProcess,0.18665448,0.09332724
 ```
 
+The contents of `summary.csv` are as follows:
+
 ```csv
-Stage,Total Time (ms),Number of Instances,Avg Time Per Instance (ms)
-PreProcess,161.26751899719238,20,8.06337594985962
-Inference,85.17694473266602,20,4.258847236633301
-PostProcess,0.3616809844970703,20,0.018084049224853516
-End2End,256.90770149230957,20,12.845385074615479
-WarmUp,5412.3780727386475,10,541.2378072738647
+Iters,Batch Size,Instances,Type,Avg Time Per Iter (ms),Avg Time Per Instance (ms)
+10,2,20,Preprocessing,95.32786242,47.66393121
+10,2,20,Inference,10.65373025,5.32686512
+10,2,20,Postprocessing,0.18665448,0.09332724
+10,2,20,Core,106.16824715,53.08412358
+10,2,20,Other,2.74794563,1.37397281
+10,2,20,End-to-End,108.91619278,54.45809639
 ```
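The shell examples above export the benchmark variables per command. As a minimal sketch, the same run can be driven entirely from Python by setting the documented variables before `paddlex` is imported (this assumes the flags are read at import time, as the `paddlex/utils/flags.py` changes in this commit suggest):

```python
import os

# The environment variables must be set before paddlex is imported,
# since the flags module appears to read them at import time.
os.environ["PADDLE_PDX_INFER_BENCHMARK"] = "True"
os.environ["PADDLE_PDX_INFER_BENCHMARK_WARMUP"] = "5"
os.environ["PADDLE_PDX_INFER_BENCHMARK_ITERS"] = "10"
os.environ["PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR"] = "./benchmark"

from paddlex import create_model

model = create_model(model_name="PicoDet-XS", model_dir=None)
# A single predict() call runs the configured warm-up and timed iterations.
output = list(model.predict(input="./test.png", batch_size=2))
```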

+ 90 - 86
docs/module_usage/instructions/benchmark.md

@@ -2,62 +2,60 @@
 
 ## Table of Contents
 
-- [1. Instructions](#1.使用说明)
-- [2. Usage Examples](#2.使用示例)
+- [1. Instructions](#1.-使用说明)
+- [2. Usage Examples](#2.-使用示例)
   - [2.1 Command Line Method](#2.1-命令行方式)
   - [2.2 Python Script Method](#2.2-Python-脚本方式)
-- [3. Explanation of Results](#3.结果说明)
+- [3. Explanation of Results](#3.-结果说明)
 
-## 1.Instructions
+## 1. Instructions
 
-Benchmark collects, over the end-to-end model inference process, the average execution time per iteration (`Avg Time Per Iter (ms)`) and per sample (`Avg Time Per Instance (ms)`) for all operations (`Operation`) and stages (`Stage`), in milliseconds.
+The benchmark feature collects the average execution time per iteration and the average execution time per instance for every operation in the end-to-end model inference process, and provides summary information. Time measurements are in milliseconds.
 
-Benchmark must be enabled via environment variables, as follows:
+The benchmark feature must be enabled via environment variables, as follows:
 
-* `PADDLE_PDX_INFER_BENCHMARK`: set to `True` to enable benchmarking (default `False`);
-* `PADDLE_PDX_INFER_BENCHMARK_WARMUP`: warm-up setting; loop n times before testing begins (default `0`);
-* `PADDLE_PDX_INFER_BENCHMARK_ITER`: number of benchmark test iterations (default `0`);
-* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT`: directory for saving results, e.g. `./benchmark` (default `None`, meaning benchmark metrics are not saved);
+* `PADDLE_PDX_INFER_BENCHMARK`: set to `True` to enable the benchmark feature (default `False`);
+* `PADDLE_PDX_INFER_BENCHMARK_WARMUP`: number of warm-up iterations before testing (default `0`);
+* `PADDLE_PDX_INFER_BENCHMARK_ITERS`: number of test iterations (default `0`);
+* `PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR`: directory where metrics are saved, e.g. `./benchmark` (default `None`, meaning benchmark metrics are not saved).
 
 **Note**:
 
-* At least one of `PADDLE_PDX_INFER_BENCHMARK_WARMUP` or `PADDLE_PDX_INFER_BENCHMARK_ITER` must be set to a value greater than zero; otherwise, benchmarking cannot be enabled.
+* At least one of `PADDLE_PDX_INFER_BENCHMARK_WARMUP` or `PADDLE_PDX_INFER_BENCHMARK_ITERS` must be set to a value greater than zero; otherwise, the benchmark feature cannot be used.
+* The benchmark feature does not currently apply to model pipelines.
 
-## 2.Usage Examples
+## 2. Usage Examples
 
-You can use benchmark in the following two ways: the command line method or the Python script method.
+You can use the benchmark feature in either of the following two ways: the command line method or the Python script method.
 
 ### 2.1 Command Line Method
 
 **Note**:
 
-- For input parameter descriptions, refer to the [PaddleX Common Model Configuration File Parameter Explanation](./config_parameters_common.md)
-- In benchmark mode, `Predict.input` can only be set to a local path of input data. If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match `batch_size`.
+- For input parameter descriptions, refer to the [PaddleX Common Model Configuration File Parameter Explanation](./config_parameters_common.md)
+- If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match `batch_size`.
 
 Execute the command:
 
 ```bash
 PADDLE_PDX_INFER_BENCHMARK=True \
 PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \
-PADDLE_PDX_INFER_BENCHMARK_ITER=10 \
-PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark \
+PADDLE_PDX_INFER_BENCHMARK_ITERS=10 \
+PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark \
 python main.py \
     -c ./paddlex/configs/modules/object_detection/PicoDet-XS.yaml \
     -o Global.mode=predict \
     -o Predict.model_dir=None \
     -o Predict.batch_size=2 \
     -o Predict.input=./test.png
-
-# Use the pptrt inference backend
-#   -o Predict.kernel_option="{'run_mode': 'trt_fp32'}"
 ```
 
 ### 2.2 Python Script Method
 
 **Note**:
 
-- For input parameter descriptions, refer to the [PaddleX Single Model Python Usage Instructions](./model_python_API.md)
-- In benchmark mode, `input` can only be set to a local path of input data. If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match `batch_size`.
+- For input parameter descriptions, refer to the [PaddleX Single Model Python Usage Instructions](./model_python_API.md)
+- If `batch_size` is greater than 1, the input data will be duplicated `batch_size` times to match `batch_size`.
 
 Create the `test_infer.py` script:
 
@@ -66,15 +64,6 @@ from paddlex import create_model
 
 model = create_model(model_name="PicoDet-XS", model_dir=None)
 output = list(model.predict(input="./test.png", batch_size=2))
-
-# Use the pptrt inference backend
-# from paddlex import create_model
-# from paddlex.inference.utils.pp_option import PaddlePredictorOption
-
-# pp_option = PaddlePredictorOption()
-# pp_option.run_mode = "trt_fp32"
-# model = create_model(model_name="PicoDet-XS", model_dir=None, pp_option=pp_option)
-# output = list(model.predict(input="./test.png", batch_size=2))
 ```
 
 Run the script:
@@ -82,14 +71,14 @@ output = list(model.predict(input="./test.png", batch_size=2))
 ```bash
 PADDLE_PDX_INFER_BENCHMARK=True \
 PADDLE_PDX_INFER_BENCHMARK_WARMUP=5 \
-PADDLE_PDX_INFER_BENCHMARK_ITER=10 \
-PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark \
+PADDLE_PDX_INFER_BENCHMARK_ITERS=10 \
+PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark \
 python test_infer.py
 ```
 
-## 3.Example Results
+## 3. Explanation of Results
 
-After Benchmark is enabled, Benchmark results are printed automatically; details are as follows:
+After the benchmark feature is enabled, benchmark results are printed automatically; details are as follows:
 
 <table border="1">
     <thead>
@@ -101,23 +90,32 @@ python test_infer.py
     <tbody>
         <tr>
             <td>Iters</td>
-            <td>Number of iterations, i.e., the number of times model inference is executed in a loop.</td>
+            <td>Number of iterations, i.e., the number of times inference is executed in a loop.</td>
         </tr>
         <tr>
             <td>Batch Size</td>
-            <td>Batch size, i.e., the number of samples processed in each iteration.</td>
+            <td>Batch size, i.e., the number of instances processed in each iteration.</td>
         </tr>
         <tr>
             <td>Instances</td>
-            <td>Total samples, calculated as <code>Iters</code> multiplied by <code>Batch Size</code>.</td>
+            <td>Total number of instances, calculated as <code>Iters</code> multiplied by <code>Batch Size</code>.</td>
         </tr>
         <tr>
             <td>Operation</td>
             <td>Name of the operation, such as <code>Resize</code>, <code>Normalize</code>, etc.</td>
         </tr>
         <tr>
-            <td>Stage</td>
-            <td>Stage name, including preprocessing (PreProcess), inference (Inference), postprocessing (PostProcess), and end-to-end (End2End).</td>
+            <td>Type</td>
+            <td>Type of time consumption, including:
+            <ul>
+            <li>preprocessing time (<code>Preprocessing</code>)</li>
+            <li>model inference time (<code>Inference</code>)</li>
+            <li>postprocessing time (<code>Postprocessing</code>)</li>
+            <li>core time (<code>Core</code>, i.e., preprocessing + inference + postprocessing)</li>
+            <li>other time (<code>Other</code>)</li>
+            <li>end-to-end time (<code>End-to-End</code>, i.e., core + other)</li>
+            </ul>
+            </td>
         </tr>
         <tr>
             <td>Avg Time Per Iter (ms)</td>
@@ -125,71 +123,77 @@ python test_infer.py
         </tr>
         <tr>
             <td>Avg Time Per Instance (ms)</td>
-            <td>Average execution time per sample, in milliseconds.</td>
+            <td>Average execution time per instance, in milliseconds.</td>
         </tr>
     </tbody>
 </table>
 
-The Benchmark results obtained by running the example program in Section 2 are as follows:
+The benchmark results obtained by running the example program in Section 2 are as follows:
 
 ```
-                                             WarmUp Data
-+-------+------------+-----------+-------------+------------------------+----------------------------+
-| Iters | Batch Size | Instances |    Stage    | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
-+-------+------------+-----------+-------------+------------------------+----------------------------+
-|   5   |     2      |     10    |  PreProcess |      98.70615005       |        49.35307503         |
-|   5   |     2      |     10    |  Inference  |      68.70298386       |        34.35149193         |
-|   5   |     2      |     10    | PostProcess |       0.22978783       |         0.11489391         |
-|   5   |     2      |     10    |   End2End   |      167.63892174      |        83.81946087         |
-+-------+------------+-----------+-------------+------------------------+----------------------------+
-                                               Detail Data
+                                               WarmUp Data
++-------+------------+-----------+----------------+------------------------+----------------------------+
+| Iters | Batch Size | Instances |      Type      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+|   5   |     2      |     10    | Preprocessing  |      97.89338876       |        48.94669438         |
+|   5   |     2      |     10    |   Inference    |      66.70711380       |        33.35355690         |
+|   5   |     2      |     10    | Postprocessing |       0.20138482       |         0.10069241         |
+|   5   |     2      |     10    |      Core      |      164.80188738      |        82.40094369         |
+|   5   |     2      |     10    |     Other      |       3.41097047       |         1.70548523         |
+|   5   |     2      |     10    |   End-to-End   |      168.21285784      |        84.10642892         |
++-------+------------+-----------+----------------+------------------------+----------------------------+
+                                                 Detail Data
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+| Iters | Batch Size | Instances |     Operation      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+|   10  |     2      |     20    |     ReadImage      |      76.22221033       |        38.11110517         |
+|   10  |     2      |     20    |       Resize       |      12.02824502       |         6.01412251         |
+|   10  |     2      |     20    |     Normalize      |       6.14072606       |         3.07036303         |
+|   10  |     2      |     20    |     ToCHWImage     |       0.00533939       |         0.00266969         |
+|   10  |     2      |     20    |      ToBatch       |       0.93134162       |         0.46567081         |
+|   10  |     2      |     20    | PaddleCopyToDevice |       0.92240779       |         0.46120390         |
+|   10  |     2      |     20    |  PaddleModelInfer  |       9.66330138       |         4.83165069         |
+|   10  |     2      |     20    |  PaddleCopyToHost  |       0.06802108       |         0.03401054         |
+|   10  |     2      |     20    |   DetPostProcess   |       0.18665448       |         0.09332724         |
++-------+------------+-----------+--------------------+------------------------+----------------------------+
+                                               Summary Data
 +-------+------------+-----------+----------------+------------------------+----------------------------+
-| Iters | Batch Size | Instances |   Operation    | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
+| Iters | Batch Size | Instances |      Type      | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
 +-------+------------+-----------+----------------+------------------------+----------------------------+
-|   10  |     2      |     20    |   ReadImage    |      77.00567245       |        38.50283623         |
-|   10  |     2      |     20    |     Resize     |      11.97342873       |         5.98671436         |
-|   10  |     2      |     20    |   Normalize    |       6.09791279       |         3.04895639         |
-|   10  |     2      |     20    |   ToCHWImage   |       0.00574589       |         0.00287294         |
-|   10  |     2      |     20    |    ToBatch     |       0.72050095       |         0.36025047         |
-|   10  |     2      |     20    |    Copy2GPU    |       3.15101147       |         1.57550573         |
-|   10  |     2      |     20    |     Infer      |       9.58673954       |         4.79336977         |
-|   10  |     2      |     20    |    Copy2CPU    |       0.07462502       |         0.03731251         |
-|   10  |     2      |     20    | DetPostProcess |       0.22695065       |         0.11347532         |
+|   10  |     2      |     20    | Preprocessing  |      95.32786242       |        47.66393121         |
+|   10  |     2      |     20    |   Inference    |      10.65373025       |         5.32686512         |
+|   10  |     2      |     20    | Postprocessing |       0.18665448       |         0.09332724         |
+|   10  |     2      |     20    |      Core      |      106.16824715      |        53.08412358         |
+|   10  |     2      |     20    |     Other      |       2.74794563       |         1.37397281         |
+|   10  |     2      |     20    |   End-to-End   |      108.91619278      |        54.45809639         |
 +-------+------------+-----------+----------------+------------------------+----------------------------+
-                                             Summary Data
-+-------+------------+-----------+-------------+------------------------+----------------------------+
-| Iters | Batch Size | Instances |    Stage    | Avg Time Per Iter (ms) | Avg Time Per Instance (ms) |
-+-------+------------+-----------+-------------+------------------------+----------------------------+
-|   10  |     2      |     20    |  PreProcess |      95.80326080       |        47.90163040         |
-|   10  |     2      |     20    |  Inference  |      12.81237602       |         6.40618801         |
-|   10  |     2      |     20    | PostProcess |       0.22695065       |         0.11347532         |
-|   10  |     2      |     20    |   End2End   |      108.84258747      |        54.42129374         |
-+-------+------------+-----------+-------------+------------------------+----------------------------+
 ```
 
-Additionally, since `PADDLE_PDX_INFER_BENCHMARK_OUTPUT=./benchmark` is set, the above results are also saved locally to `./benchmark/detail.csv` and `./benchmark/summary.csv`:
+Additionally, since `PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR=./benchmark` is set, the above results are also saved locally to `./benchmark/detail.csv` and `./benchmark/summary.csv`:
 
 The contents of `detail.csv` are as follows:
 
 ```csv
 Iters,Batch Size,Instances,Operation,Avg Time Per Iter (ms),Avg Time Per Instance (ms)
-10,2,20,ReadImage,77.00567245,38.50283623
-10,2,20,Resize,11.97342873,5.98671436
-10,2,20,Normalize,6.09791279,3.04895639
-10,2,20,ToCHWImage,0.00574589,0.00287294
-10,2,20,ToBatch,0.72050095,0.36025047
-10,2,20,Copy2GPU,3.15101147,1.57550573
-10,2,20,Infer,9.58673954,4.79336977
-10,2,20,Copy2CPU,0.07462502,0.03731251
-10,2,20,DetPostProcess,0.22695065,0.11347532
+10,2,20,ReadImage,76.22221033,38.11110517
+10,2,20,Resize,12.02824502,6.01412251
+10,2,20,Normalize,6.14072606,3.07036303
+10,2,20,ToCHWImage,0.00533939,0.00266969
+10,2,20,ToBatch,0.93134162,0.46567081
+10,2,20,PaddleCopyToDevice,0.92240779,0.46120390
+10,2,20,PaddleModelInfer,9.66330138,4.83165069
+10,2,20,PaddleCopyToHost,0.06802108,0.03401054
+10,2,20,DetPostProcess,0.18665448,0.09332724
 ```
 
 The contents of `summary.csv` are as follows:
 
 ```csv
-Iters,Batch Size,Instances,Stage,Avg Time Per Iter (ms),Avg Time Per Instance (ms)
-10,2,20,PreProcess,95.80326080,47.90163040
-10,2,20,Inference,12.81237602,6.40618801
-10,2,20,PostProcess,0.22695065,0.11347532
-10,2,20,End2End,108.84258747,54.42129374
+Iters,Batch Size,Instances,Type,Avg Time Per Iter (ms),Avg Time Per Instance (ms)
+10,2,20,Preprocessing,95.32786242,47.66393121
+10,2,20,Inference,10.65373025,5.32686512
+10,2,20,Postprocessing,0.18665448,0.09332724
+10,2,20,Core,106.16824715,53.08412358
+10,2,20,Other,2.74794563,1.37397281
+10,2,20,End-to-End,108.91619278,54.45809639
 ```
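Since both the English and Chinese docs fix the CSV schema (`Iters,Batch Size,Instances,Type,...`), the saved metrics are easy to post-process. A small optional sketch, assuming pandas is installed:

```python
import pandas as pd

# Column names come from the summary.csv excerpt above.
summary = pd.read_csv("./benchmark/summary.csv")
print(summary[["Type", "Avg Time Per Instance (ms)"]])
```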

+ 3 - 24
paddlex/inference/common/batch_sampler/image_batch_sampler.py

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import ast
 from pathlib import Path
 import numpy as np
 
@@ -86,7 +85,7 @@ class ImageBatchSampler(BaseBatchSampler):
                 batch.append(input, None, None)
                 if len(batch) == self.batch_size:
                     yield batch
-                    batch.reset()
+                    batch = ImgInstance()
             elif isinstance(input, str) and input.split(".")[-1] in ("PDF", "pdf"):
                 file_path = (
                     self._download_from_url(input)
@@ -97,7 +96,7 @@ class ImageBatchSampler(BaseBatchSampler):
                     batch.append(page_img, file_path, page_idx)
                     if len(batch) == self.batch_size:
                         yield batch
-                        batch.reset()
+                        batch = ImgInstance()
             elif isinstance(input, str):
                 file_path = (
                     self._download_from_url(input)
@@ -109,30 +108,10 @@ class ImageBatchSampler(BaseBatchSampler):
                     batch.append(file_path, file_path, None)
                     if len(batch) == self.batch_size:
                         yield batch
-                        batch.reset()
+                        batch = ImgInstance()
             else:
                 logging.warning(
                     f"Not supported input data type! Only `numpy.ndarray` and `str` are supported! So has been ignored: {input}."
                 )
         if len(batch) > 0:
             yield batch
-
-    def _rand_batch(self, data_size):
-        def parse_size(s):
-            res = ast.literal_eval(s)
-            if isinstance(res, int):
-                return (res, res)
-            else:
-                assert isinstance(res, (tuple, list))
-                assert len(res) == 2
-                assert all(isinstance(item, int) for item in res)
-                return res
-
-        rand_batch = ImgInstance()
-        size = parse_size(data_size)
-        for _ in range(self.batch_size):
-            rand_batch.append(
-                np.random.randint(0, 256, (*size, 3), dtype=np.uint8), None, None
-            )
-
-        return rand_batch
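The sampler change above swaps the in-place `batch.reset()` for a fresh `ImgInstance()` after each yield. A toy sketch (not PaddleX code) of the aliasing hazard the old pattern invites when a consumer keeps a reference to the yielded object:

```python
def batches_shared(items, batch_size=2):
    """Yields one shared list and clears it in place after each yield."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch.clear()  # mutates the list the consumer just received

def batches_fresh(items, batch_size=2):
    """Rebinds to a new list after each yield, leaving yielded batches intact."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []

print(list(batches_shared("abcd")))  # [[], []] -- earlier batches were wiped
print(list(batches_fresh("abcd")))   # [['a', 'b'], ['c', 'd']]
```

Whether `reset()` ever misbehaved in practice depends on whether downstream code copied the batch before the sampler resumed; allocating a new `ImgInstance()` removes the question entirely.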

+ 0 - 20
paddlex/inference/common/batch_sampler/video_batch_sampler.py

@@ -13,9 +13,7 @@
 # limitations under the License.
 
 import os
-import ast
 from pathlib import Path
-import numpy as np
 
 from ....utils import logging
 from ....utils.download import download
@@ -74,21 +72,3 @@ class VideoBatchSampler(BaseBatchSampler):
                 )
         if len(batch) > 0:
             yield batch
-
-    def _rand_batch(self, data_size):
-        def parse_size(s):
-            res = ast.literal_eval(s)
-            if isinstance(res, int):
-                return (res, res)
-            else:
-                assert isinstance(res, (tuple, list))
-                assert len(res) == 2
-                assert all(isinstance(item, int) for item in res)
-                return res
-
-        size = parse_size(data_size)
-        rand_batch = [
-            np.random.randint(0, 256, (*size, 3), dtype=np.uint8)
-            for _ in range(self.batch_size)
-        ]
-        return rand_batch

+ 3 - 0
paddlex/inference/common/reader/det_3d_reader.py

@@ -18,6 +18,8 @@ import os
 from typing import Generic, List, Optional, Any, Dict
 import pickle
 
+from ...utils.benchmark import benchmark
+
 
 class _EasyDict(dict):
     def __getattr__(self, key: str):
@@ -79,6 +81,7 @@ class Sample(_EasyDict):
         self.attrs = None
 
 
+@benchmark.timeit
 class ReadNuscenesData:
 
     def __init__(
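Throughout this commit, `@benchmark.timeit` moves from decorating `__call__` to decorating the class itself (here and in the reader and processor files below). A hypothetical sketch of a class decorator compatible with this usage; the real implementation lives in `paddlex/inference/utils/benchmark.py`:

```python
import functools
import time

def timeit(cls):
    """Hypothetical: wrap cls.__call__ so every invocation is timed."""
    original_call = cls.__call__

    @functools.wraps(original_call)
    def timed_call(self, *args, **kwargs):
        start = time.perf_counter()
        result = original_call(self, *args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000
        # A real collector would record (cls.__name__, elapsed_ms) for the
        # "Operation" rows of the report; printing stands in for that here.
        print(f"{cls.__name__}: {elapsed_ms:.4f} ms")
        return result

    cls.__call__ = timed_call
    return cls

@timeit
class Resize:
    def __call__(self, imgs):
        return imgs  # stand-in for real work

Resize()([1, 2, 3])  # prints e.g. "Resize: 0.0012 ms"
```

Decorating the class rather than each method keeps the operation name tied to the class (matching `Operation` entries such as `ReadImage`) and removes the per-method boilerplate the old placement required.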

+ 1 - 1
paddlex/inference/common/reader/image_reader.py

@@ -19,6 +19,7 @@ from ...utils.io import ImageReader, PDFReader
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class ReadImage:
     """Load image from the file."""
 
@@ -41,7 +42,6 @@ class ReadImage:
         flags = self._FLAGS_DICT[self.format]
         self._img_reader = ImageReader(backend="opencv", flags=flags)
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.read(img) for img in imgs]

+ 2 - 0
paddlex/inference/common/reader/ts_reader.py

@@ -16,8 +16,10 @@ import numpy as np
 import pandas as pd
 
 from ...utils.io import CSVReader
+from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class ReadTS:
 
     def __init__(self):

+ 2 - 0
paddlex/inference/common/reader/video_reader.py

@@ -16,8 +16,10 @@ import numpy as np
 import cv2
 
 from ...utils.io import VideoReader
+from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class ReadVideo:
     """Load video from the file."""
 

+ 8 - 8
paddlex/inference/models/3d_bev_detection/processors.py

@@ -34,6 +34,7 @@ cv2_interp_codes = {
 }
 
 
+@benchmark.timeit
 class LoadPointsFromFile:
     """Load points from a file and process them according to specified parameters."""
 
@@ -71,7 +72,6 @@ class LoadPointsFromFile:
         points = np.fromfile(pts_filename, dtype=np.float32)
         return points
 
-    @benchmark.timeit
     def __call__(self, results):
         """Call function to load points data from file and process it.
 
@@ -114,6 +114,7 @@ class LoadPointsFromFile:
         return results
 
 
+@benchmark.timeit
 class LoadPointsFromMultiSweeps(object):
     """Load points from multiple sweeps.This is usually used for nuScenes dataset to utilize previous sweeps."""
 
@@ -221,7 +222,6 @@ class LoadPointsFromMultiSweeps(object):
         )
         return points[filt]
 
-    @benchmark.timeit
     def __call__(self, results):
         """Call function to load multi-sweep point clouds from files.
 
@@ -278,6 +278,7 @@ class LoadPointsFromMultiSweeps(object):
         return results
 
 
+@benchmark.timeit
 class LoadMultiViewImageFromFiles:
     """Load multi-view images from files."""
 
@@ -308,7 +309,6 @@ class LoadMultiViewImageFromFiles:
         self.constant_std = constant_std
         self.imread_flag = imread_flag
 
-    @benchmark.timeit
     def __call__(self, sample):
         """
         Call method to load multi-view image from files and update the sample dictionary.
@@ -345,6 +345,7 @@ class LoadMultiViewImageFromFiles:
         return sample
 
 
+@benchmark.timeit
 class ResizeImage:
     """Resize images & bbox & mask."""
 
@@ -640,7 +641,6 @@ class ResizeImage:
         """Resize semantic segmentation map with ``results['scale']``."""
         raise NotImplementedError
 
-    @benchmark.timeit
     def __call__(self, results):
         """Call function to resize images, bounding boxes, masks, and semantic segmentation maps according to the provided scale or scale factor.
 
@@ -676,6 +676,7 @@ class ResizeImage:
         return results
 
 
+@benchmark.timeit
 class NormalizeImage:
     """Normalize the image."""
 
@@ -714,7 +715,6 @@ class NormalizeImage:
         cv2.multiply(img, stdinv, img)  # inplace
         return img
 
-    @benchmark.timeit
     def __call__(self, results):
         """Call method to normalize images in the results dictionary.
 
@@ -735,6 +735,7 @@ class NormalizeImage:
         return results
 
 
+@benchmark.timeit
 class PadImage(object):
     """Pad the image & mask."""
 
@@ -859,13 +860,13 @@ class PadImage(object):
         """Pad semantic segmentation map according to ``results['pad_shape']``."""
         raise NotImplementedError
 
-    @benchmark.timeit
     def __call__(self, results):
         """Call function to pad images, masks, semantic segmentation maps."""
         self._pad_img(results)
         return results
 
 
+@benchmark.timeit
 class SampleFilterByKey:
     """Collect data from the loader relevant to the specific task."""
 
@@ -897,7 +898,6 @@ class SampleFilterByKey:
         self.keys = keys
         self.meta_keys = meta_keys
 
-    @benchmark.timeit
     def __call__(self, sample):
         """Call function to filter sample by keys. The keys in `meta_keys` are used to filter metadata from the input sample.
 
@@ -922,6 +922,7 @@ class SampleFilterByKey:
         return filtered_sample
 
 
+@benchmark.timeit
 class GetInferInput:
     """Collect infer input data from transformed sample"""
 
@@ -952,7 +953,6 @@ class GetInferInput:
                 collated_batch[k] = [elem[k] for elem in batch]
         return collated_batch
 
-    @benchmark.timeit
     def __call__(self, sample):
         """Call function to infer input data from transformed sample
 

+ 1 - 1
paddlex/inference/models/anomaly_detection/processors.py

@@ -18,6 +18,7 @@ from skimage import measure, morphology
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class MapToMask:
     """Map_to_mask"""
 
@@ -27,7 +28,6 @@ class MapToMask:
         """
         super().__init__()
 
-    @benchmark.timeit
     def __call__(self, preds, *args):
         """apply"""
         return [self.apply(pred) for pred in preds]

+ 10 - 1
paddlex/inference/models/base/predictor/base_predictor.py

@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Iterator
 from pathlib import Path
 from abc import abstractmethod, ABC
 
+from .....utils.flags import INFER_BENCHMARK
 from ....utils.io import YAMLReader
 from ....common.batch_sampler import BaseBatchSampler
 
@@ -137,7 +138,15 @@ class BasePredictor(ABC):
         Yields:
             Iterator[Any]: An iterator yielding prediction results.
         """
-        for batch_data in self.batch_sampler(input):
+        if INFER_BENCHMARK:
+            if not isinstance(input, list):
+                raise TypeError("In benchmark mode, `input` must be a list")
+            batches = list(self.batch_sampler(input))
+            if len(batches) != 1 or len(batches[0]) != len(input):
+                raise ValueError("Unexpected number of instances")
+        else:
+            batches = self.batch_sampler(input)
+        for batch_data in batches:
             prediction = self.process(batch_data, **kwargs)
             prediction = PredictionWrap(prediction, len(batch_data))
             for idx in range(len(batch_data)):

+ 16 - 12
paddlex/inference/models/base/predictor/basic_predictor.py

@@ -19,11 +19,11 @@ from .....utils.subclass_register import AutoRegisterABCMetaClass
 from .....utils.flags import (
     INFER_BENCHMARK,
     INFER_BENCHMARK_WARMUP,
-    INFER_BENCHMARK_ITER,
+    INFER_BENCHMARK_ITERS,
 )
 from .....utils import logging
 from ....utils.pp_option import PaddlePredictorOption
-from ....utils.benchmark import benchmark
+from ....utils.benchmark import benchmark, ENTRY_POINT_NAME
 from .base_predictor import BasePredictor
 
 
@@ -56,7 +56,7 @@ class BasicPredictor(
         if not pp_option:
             pp_option = PaddlePredictorOption(model_name=self.model_name)
         if device:
-            pp_option.device = device
+            pp_option.set_device(device)
         trt_dynamic_shapes = (
             self.config.get("Hpi", {})
             .get("backend_configs", {})
@@ -95,25 +95,29 @@ class BasicPredictor(
         self.set_predictor(batch_size, device, pp_option)
         if INFER_BENCHMARK:
             # TODO(zhang-prog): Get metadata of input data
-            if not isinstance(input, str):
-                raise TypeError("Only support string as input")
+            @benchmark.timeit_with_name(ENTRY_POINT_NAME)
+            def _apply(input, **kwargs):
+                return list(self.apply(input, **kwargs))
+
+            if isinstance(input, list):
+                raise TypeError("`input` cannot be a list in benchmark mode")
             input = [input] * batch_size
 
-            if not (INFER_BENCHMARK_WARMUP > 0 or INFER_BENCHMARK_ITER > 0):
+            if not (INFER_BENCHMARK_WARMUP > 0 or INFER_BENCHMARK_ITERS > 0):
                 raise RuntimeError(
-                    "At least one of `INFER_BENCHMARK_WARMUP` and `INFER_BENCHMARK_ITER` must be greater than zero"
+                    "At least one of `INFER_BENCHMARK_WARMUP` and `INFER_BENCHMARK_ITERS` must be greater than zero"
                 )
 
             if INFER_BENCHMARK_WARMUP > 0:
                 benchmark.start_warmup()
                 for _ in range(INFER_BENCHMARK_WARMUP):
-                    output = list(self.apply(input, **kwargs))
+                    output = _apply(input, **kwargs)
                 benchmark.collect(batch_size)
                 benchmark.stop_warmup()
 
-            if INFER_BENCHMARK_ITER > 0:
-                for _ in range(INFER_BENCHMARK_ITER):
-                    output = list(self.apply(input, **kwargs))
+            if INFER_BENCHMARK_ITERS > 0:
+                for _ in range(INFER_BENCHMARK_ITERS):
+                    output = _apply(input, **kwargs)
                 benchmark.collect(batch_size)
 
             yield output[0]
@@ -141,6 +145,6 @@ class BasicPredictor(
             self.batch_sampler.batch_size = batch_size
             self.pp_option.batch_size = batch_size
         if device and device != self.pp_option.device:
-            self.pp_option.device = device
+            self.pp_option.set_device(device)
         if pp_option and pp_option != self.pp_option:
             self.pp_option = pp_option
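Two details of the benchmark loop above are worth noting. Wrapping `list(self.apply(input, **kwargs))` in a local `_apply` decorated with `timeit_with_name(ENTRY_POINT_NAME)` forces the prediction generator to be fully consumed inside the timed region, so lazily executed per-item work is counted, and the whole call is recorded under a well-known entry-point name. A hypothetical sketch of such a decorator, by analogy with the class-level `timeit` sketch above (the real one is in `paddlex/inference/utils/benchmark.py`):

```python
import functools
import time

def timeit_with_name(name):
    """Hypothetical: time a plain function and record it under `name`."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed_ms = (time.perf_counter() - start) * 1000
            # Recording the entry point lets per-operation times be
            # subtracted from it, which is one way to derive the "Other"
            # and "End-to-End" rows of the report.
            print(f"{name}: {elapsed_ms:.4f} ms")
            return result
        return wrapper
    return decorator
```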

+ 350 - 151
paddlex/inference/models/common/static_infer.py

@@ -12,45 +12,126 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Union, Tuple, List, Dict, Any, Iterator
-import os
+from typing import Sequence, List
 from pathlib import Path
-import lazy_paddle as paddle
+
+import lazy_paddle
 import numpy as np
 
-from ....utils.flags import DEBUG, FLAGS_json_format_model, USE_PIR_TRT
-from ...utils.benchmark import benchmark
 from ....utils import logging
+from ....utils.device import constr_device
+from ....utils.flags import DEBUG, USE_PIR_TRT
+from ...utils.benchmark import benchmark, set_inference_operations
+from ...utils.hpi import get_model_paths
 from ...utils.pp_option import PaddlePredictorOption
 from ...utils.trt_config import TRT_CFG
 
 
+CACHE_DIR = ".cache"
+
+INFERENCE_OPERATIONS = ["PaddleCopyToDevice", "PaddleCopyToHost", "PaddleModelInfer"]
+set_inference_operations(INFERENCE_OPERATIONS)
+
+
+# XXX: Better use Paddle Inference API to do this
+def _pd_dtype_to_np_dtype(pd_dtype):
+    if pd_dtype == lazy_paddle.inference.DataType.FLOAT64:
+        return np.float64
+    elif pd_dtype == lazy_paddle.inference.DataType.FLOAT32:
+        return np.float32
+    elif pd_dtype == lazy_paddle.inference.DataType.INT64:
+        return np.int64
+    elif pd_dtype == lazy_paddle.inference.DataType.INT32:
+        return np.int32
+    elif pd_dtype == lazy_paddle.inference.DataType.UINT8:
+        return np.uint8
+    elif pd_dtype == lazy_paddle.inference.DataType.INT8:
+        return np.int8
+    else:
+        raise TypeError(f"Unsupported data type: {pd_dtype}")
+
+
 # old trt
-def collect_trt_shapes(
-    model_file, model_params, gpu_id, shape_range_info_path, trt_dynamic_shapes
+def _collect_trt_shape_range_info(
+    model_file,
+    model_params,
+    gpu_id,
+    shape_range_info_path,
+    dynamic_shapes,
+    dynamic_shape_input_data,
 ):
-    config = paddle.inference.Config(model_file, model_params)
+
+    dynamic_shape_input_data = dynamic_shape_input_data or {}
+
+    config = lazy_paddle.inference.Config(model_file, model_params)
     config.enable_use_gpu(100, gpu_id)
+    config.collect_shape_range_info(shape_range_info_path)
+    # TODO: Add other needed options
+    config.disable_glog_info()
+    predictor = lazy_paddle.inference.create_predictor(config)
+
+    input_names = predictor.get_input_names()
+    for name in dynamic_shapes:
+        if name not in input_names:
+            raise ValueError(
+                f"Invalid input name {repr(name)} found in `dynamic_shapes`"
+            )
+    for name in input_names:
+        if name not in dynamic_shapes:
+            raise ValueError(f"Input name {repr(name)} not found in `dynamic_shapes`")
+    for name in dynamic_shape_input_data:
+        if name not in input_names:
+            raise ValueError(
+                f"Invalid input name {repr(name)} found in `dynamic_shape_input_data`"
+            )
+    # It would be better to check if the shapes are valid.
+
     min_arrs, opt_arrs, max_arrs = {}, {}, {}
-    for name, candidate_shapes in trt_dynamic_shapes.items():
+    for name, candidate_shapes in dynamic_shapes.items():
+        # XXX: Currently we have no way to get the data type of the tensor
+        # without creating an input handle.
+        handle = predictor.get_input_handle(name)
+        dtype = _pd_dtype_to_np_dtype(handle.type())
         min_shape, opt_shape, max_shape = candidate_shapes
-        min_arrs[name] = np.ones(min_shape, dtype=np.float32)
-        opt_arrs[name] = np.ones(opt_shape, dtype=np.float32)
-        max_arrs[name] = np.ones(max_shape, dtype=np.float32)
+        if name in dynamic_shape_input_data:
+            min_arrs[name] = np.array(
+                dynamic_shape_input_data[name][0], dtype=dtype
+            ).reshape(min_shape)
+            opt_arrs[name] = np.array(
+                dynamic_shape_input_data[name][1], dtype=dtype
+            ).reshape(opt_shape)
+            max_arrs[name] = np.array(
+                dynamic_shape_input_data[name][2], dtype=dtype
+            ).reshape(max_shape)
+        else:
+            min_arrs[name] = np.ones(min_shape, dtype=dtype)
+            opt_arrs[name] = np.ones(opt_shape, dtype=dtype)
+            max_arrs[name] = np.ones(max_shape, dtype=dtype)
 
-    config.collect_shape_range_info(shape_range_info_path)
-    predictor = paddle.inference.create_predictor(config)
-    # opt_arrs would be used twice to simulate the most common situations
+    # `opt_arrs` is used twice to ensure it is the most frequently used.
     for arrs in [min_arrs, opt_arrs, opt_arrs, max_arrs]:
         for name, arr in arrs.items():
-            input_handler = predictor.get_input_handle(name)
-            input_handler.reshape(arr.shape)
-            input_handler.copy_from_cpu(arr)
+            handle = predictor.get_input_handle(name)
+            handle.reshape(arr.shape)
+            handle.copy_from_cpu(arr)
         predictor.run()
 
+    # HACK: The shape range info will be written to the file only when
+    # `predictor` is garbage collected. It works in CPython, but it is
+    # definitely a bad idea to count on the implementation-dependent behavior of
+    # a garbage collector. Is there a more explicit and deterministic way to
+    # handle this?
+
 
 # pir trt
-def convert_trt(model_name, mode, pp_model_path, trt_save_path, trt_dynamic_shapes):
+def _convert_trt(
+    model_name,
+    mode,
+    pp_model_file,
+    pp_params_file,
+    trt_save_path,
+    trt_dynamic_shapes,
+):
     def _set_trt_config():
         if settings := TRT_CFG.get(model_name):
             for attr_name in settings:
@@ -65,14 +146,33 @@ def convert_trt(model_name, mode, pp_model_path, trt_save_path, trt_dynamic_shap
         PrecisionMode,
     )
 
+    def _get_input_names(model_file, params_file):
+        # HACK
+        config = lazy_paddle.inference.Config(str(model_file), str(params_file))
+        config.disable_glog_info()
+        predictor = lazy_paddle.inference.create_predictor(config)
+        return predictor.get_input_names()
+
+    input_names = _get_input_names(pp_model_file, pp_params_file)
+    for name in trt_dynamic_shapes:
+        if name not in input_names:
+            raise ValueError(
+                f"Invalid input name {repr(name)} found in `trt_dynamic_shapes`"
+            )
+    for name in input_names:
+        if name not in trt_dynamic_shapes:
+            raise ValueError(
+                f"Input name {repr(name)} not found in `trt_dynamic_shapes`"
+            )
+
     precision_map = {
         "trt_int8": PrecisionMode.INT8,
         "trt_fp32": PrecisionMode.FP32,
         "trt_fp16": PrecisionMode.FP16,
     }
     trt_inputs = []
-    for name, candidate_shapes in trt_dynamic_shapes.items():
-        min_shape, opt_shape, max_shape = candidate_shapes
+    for name in input_names:
+        min_shape, opt_shape, max_shape = trt_dynamic_shapes[name]
         trt_input = Input(
             min_input_shape=min_shape,
             optim_input_shape=opt_shape,
@@ -84,171 +184,198 @@ def convert_trt(model_name, mode, pp_model_path, trt_save_path, trt_dynamic_shap
     trt_config = TensorRTConfig(inputs=trt_inputs)
     _set_trt_config()
     trt_config.precision_mode = precision_map[mode]
-    trt_config.save_model_dir = trt_save_path
+    trt_config.save_model_dir = str(trt_save_path)
+    pp_model_path = str(pp_model_file.with_suffix(""))
     convert(pp_model_path, trt_config)
 
 
-class Copy2GPU:
-    @benchmark.timeit
+def _sort_inputs(inputs, names):
+    # NOTE: Adjust input tensors to match the sorted sequence.
+    indices = sorted(range(len(names)), key=names.__getitem__)
+    inputs = [inputs[indices.index(i)] for i in range(len(inputs))]
+    return inputs
+
+
+def _concatenate(*callables):
+    def _chain(x):
+        for c in callables:
+            x = c(x)
+        return x
+
+    return _chain
+
+
+@benchmark.timeit
+class PaddleCopyToDevice:
+    def __init__(self, device_type, device_id):
+        self.device_type = device_type
+        self.device_id = device_id
+
     def __call__(self, arrs):
-        paddle_tensors = [paddle.to_tensor(i) for i in arrs]
+        device_id = [self.device_id] if self.device_id is not None else self.device_id
+        device = constr_device(self.device_type, device_id)
+        paddle_tensors = [lazy_paddle.to_tensor(i, place=device) for i in arrs]
         return paddle_tensors
 
 
-class Copy2CPU:
-    @benchmark.timeit
+@benchmark.timeit
+class PaddleCopyToHost:
     def __call__(self, paddle_tensors):
         arrs = [i.numpy() for i in paddle_tensors]
         return arrs
 
 
-class Infer:
-
+@benchmark.timeit
+class PaddleModelInfer:
     def __init__(self, predictor):
         super().__init__()
         self.predictor = predictor
 
-    @benchmark.timeit
     def __call__(self, x):
         return self.predictor.run(x)
 
 
-class StaticInfer:
-    """Predictor based on Paddle Inference"""
+# FIXME: Name might be misleading
+@benchmark.timeit
+class PaddleInferChainLegacy:
+    def __init__(self, predictor):
+        self.predictor = predictor
+        input_names = self.predictor.get_input_names()
+        self.input_handles = []
+        self.output_handles = []
+        for input_name in input_names:
+            input_handle = self.predictor.get_input_handle(input_name)
+            self.input_handles.append(input_handle)
+        output_names = self.predictor.get_output_names()
+        for output_name in output_names:
+            output_handle = self.predictor.get_output_handle(output_name)
+            self.output_handles.append(output_handle)
+
+    def __call__(self, x):
+        for input_, input_handle in zip(x, self.input_handles):
+            input_handle.reshape(input_.shape)
+            input_handle.copy_from_cpu(input_)
+        self.predictor.run()
+        outputs = [o.copy_to_cpu() for o in self.output_handles]
+        return outputs
+
 
+class StaticInfer(object):
     def __init__(
-        self, model_dir: str, model_prefix: str, option: PaddlePredictorOption
+        self,
+        model_dir: str,
+        model_prefix: str,
+        option: PaddlePredictorOption,
     ) -> None:
         super().__init__()
         self.model_dir = model_dir
-        self.model_prefix = model_prefix
-        self.option = option
+        self.model_file_prefix = model_prefix
+        self._option = option
         self.predictor = self._create()
-        self.copy2gpu = Copy2GPU()
-        self.copy2cpu = Copy2CPU()
-        self.infer = Infer(self.predictor)
+        if not self._use_legacy_api:
+            device_type = self._option.device_type
+            device_type = "gpu" if device_type == "dcu" else device_type
+            copy_to_device = PaddleCopyToDevice(device_type, self._option.device_id)
+            copy_to_host = PaddleCopyToHost()
+            model_infer = PaddleModelInfer(self.predictor)
+            self.infer = _concatenate(copy_to_device, model_infer, copy_to_host)
+        else:
+            self.infer = PaddleInferChainLegacy(self.predictor)
+
+    @property
+    def _use_legacy_api(self):
+        return self._option.device_type not in ("cpu", "gpu", "dcu")
+
+    def __call__(self, x: Sequence[np.ndarray]) -> List[np.ndarray]:
+        names = self.predictor.get_input_names()
+        if len(names) != len(x):
+            raise ValueError(
+                f"The number of inputs does not match the model: {len(names)} vs {len(x)}"
+            )
+        # TODO:
+        # Ensure that input tensors follow the model's input sequence without sorting.
+        x = _sort_inputs(x, names)
+        x = list(map(np.ascontiguousarray, x))
+        pred = self.infer(x)
+        return pred
 
     def _create(
         self,
-    ) -> Tuple[
-        "paddle.base.libpaddle.PaddleInferPredictor",
-        "paddle.base.libpaddle.PaddleInferTensor",
-        "paddle.base.libpaddle.PaddleInferTensor",
-    ]:
+    ):
         """_create"""
-        from lazy_paddle.inference import Config, create_predictor
+        model_paths = get_model_paths(self.model_dir, self.model_file_prefix)
+        if "paddle" not in model_paths:
+            raise RuntimeError("No valid Paddle model found")
+        model_file, params_file = model_paths["paddle"]
+
+        if self._option.model_name == "LaTeX_OCR_rec":
+            import cpuinfo
+
+            if (
+                "GenuineIntel" in cpuinfo.get_cpu_info().get("vendor_id_raw", "")
+                and self._option.run_mode != "mkldnn"
+            ):
+                logging.warning(
+                    "Now, the `LaTeX_OCR_rec` model only support `mkldnn` mode when running on Intel CPU devices. So using `mkldnn` instead."
+                )
+                self._option.run_mode = "mkldnn"
+                logging.debug("`run_mode` updated to 'mkldnn'")
 
-        if FLAGS_json_format_model:
-            model_file = (self.model_dir / f"{self.model_prefix}.json").as_posix()
-        # when FLAGS_json_format_model is not set, use inference.json if exist, otherwise inference.pdmodel
-        else:
-            model_file = self.model_dir / f"{self.model_prefix}.json"
-            if model_file.exists():
-                model_file = model_file.as_posix()
-            # default by `pdmodel` suffix
-            else:
-                model_file = (
-                    self.model_dir / f"{self.model_prefix}.pdmodel"
-                ).as_posix()
-        params_file = (self.model_dir / f"{self.model_prefix}.pdiparams").as_posix()
+        if (
+            self._option.device_type in ("gpu", "dcu")
+            and self._option.device_id is None
+        ):
+            self._option.device_id = 0
+            logging.debug("`device_id` has been set to 0")
 
         # for TRT
-        if self.option.run_mode.startswith("trt"):
-            assert self.option.device == "gpu"
-            if not USE_PIR_TRT:
-                if self.option.shape_info_filename is None:
-                    shape_range_info_path = (
-                        self.model_dir / "shape_range_info.pbtxt"
-                    ).as_posix()
-                else:
-                    shape_range_info_path = self.option.shape_info_filename
-                if not os.path.exists(shape_range_info_path):
-                    logging.info(
-                        f"Dynamic shape info is collected into: {shape_range_info_path}"
-                    )
-                    collect_trt_shapes(
-                        model_file,
-                        params_file,
-                        self.option.device_id,
-                        shape_range_info_path,
-                        self.option.trt_dynamic_shapes,
-                    )
-                else:
-                    logging.info(
-                        f"A dynamic shape info file ( {shape_range_info_path} ) already exists. No need to collect again."
-                    )
-                self.option.shape_info_filename = shape_range_info_path
-            else:
-                trt_save_path = (
-                    Path(self.model_dir) / "trt" / self.model_prefix
-                ).as_posix()
-                pp_model_path = (Path(self.model_dir) / self.model_prefix).as_posix()
-                convert_trt(
-                    self.option.model_name,
-                    self.option.run_mode,
-                    pp_model_path,
-                    trt_save_path,
-                    self.option.trt_dynamic_shapes,
-                )
-                model_file = trt_save_path + ".json"
-                params_file = trt_save_path + ".pdiparams"
+        if self._option.run_mode.startswith("trt"):
+            assert self._option.device_type == "gpu"
+            cache_dir = self.model_dir / CACHE_DIR / "paddle"
+            config = self._configure_trt(
+                model_file,
+                params_file,
+                cache_dir,
+            )
+        else:
+            config = lazy_paddle.inference.Config(str(model_file), str(params_file))
 
-        config = Config(model_file, params_file)
-        if self.option.device == "gpu":
+        if self._option.device_type == "gpu":
             config.exp_disable_mixed_precision_ops({"feed", "fetch"})
-            config.enable_use_gpu(100, self.option.device_id)
-            if not self.option.run_mode.startswith("trt"):
+            config.enable_use_gpu(100, self._option.device_id)
+            if not self._option.run_mode.startswith("trt"):
                 if hasattr(config, "enable_new_ir"):
-                    config.enable_new_ir(self.option.enable_new_ir)
+                    config.enable_new_ir(self._option.enable_new_ir)
                 if hasattr(config, "enable_new_executor"):
                     config.enable_new_executor()
                 config.set_optimization_level(3)
-            # NOTE: The pptrt settings are not aligned with those of FD.
-            else:
-                if not USE_PIR_TRT:
-                    precision_map = {
-                        "trt_int8": Config.Precision.Int8,
-                        "trt_fp32": Config.Precision.Float32,
-                        "trt_fp16": Config.Precision.Half,
-                    }
-                    config.enable_tensorrt_engine(
-                        workspace_size=(1 << 30) * self.option.batch_size,
-                        max_batch_size=self.option.batch_size,
-                        min_subgraph_size=self.option.min_subgraph_size,
-                        precision_mode=precision_map[self.option.run_mode],
-                        use_static=self.option.trt_use_static,
-                        use_calib_mode=self.option.trt_calib_mode,
-                    )
-                    config.enable_tuned_tensorrt_dynamic_shape(
-                        self.option.shape_info_filename, True
-                    )
-        elif self.option.device == "npu":
+        elif self._option.device_type == "npu":
             config.enable_custom_device("npu")
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
-        elif self.option.device == "xpu":
+        elif self._option.device_type == "xpu":
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
-        elif self.option.device == "mlu":
+        elif self._option.device_type == "mlu":
             config.enable_custom_device("mlu")
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
-        elif self.option.device == "dcu":
-            config.enable_use_gpu(100, self.option.device_id)
+        elif self._option.device_type == "dcu":
+            config.enable_use_gpu(100, self._option.device_id)
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
             # XXX: is_compiled_with_rocm() must be True on dcu platform ?
-            if paddle.is_compiled_with_rocm():
+            if lazy_paddle.is_compiled_with_rocm():
                 # Delete unsupported passes in dcu
                 config.delete_pass("conv2d_add_act_fuse_pass")
                 config.delete_pass("conv2d_add_fuse_pass")
         else:
-            assert self.option.device == "cpu"
+            assert self._option.device_type == "cpu"
             config.disable_gpu()
-            if "mkldnn" in self.option.run_mode:
+            if "mkldnn" in self._option.run_mode:
                 try:
                     config.enable_mkldnn()
-                    if "bf16" in self.option.run_mode:
+                    if "bf16" in self._option.run_mode:
                         config.enable_mkldnn_bfloat16()
                 except Exception as e:
                     logging.warning(
@@ -258,43 +385,115 @@ class StaticInfer:
             else:
                 if hasattr(config, "disable_mkldnn"):
                     config.disable_mkldnn()
-            config.set_cpu_math_library_num_threads(self.option.cpu_threads)
+            config.set_cpu_math_library_num_threads(self._option.cpu_threads)
 
             if hasattr(config, "enable_new_ir"):
-                config.enable_new_ir(self.option.enable_new_ir)
+                config.enable_new_ir(self._option.enable_new_ir)
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
             config.set_optimization_level(3)
 
         config.enable_memory_optim()
-        for del_p in self.option.delete_pass:
+        for del_p in self._option.delete_pass:
             config.delete_pass(del_p)
 
         # Disable paddle inference logging
         if not DEBUG:
             config.disable_glog_info()
 
-        predictor = create_predictor(config)
-
-        # Get input and output handlers
-        input_names = predictor.get_input_names()
-        input_names.sort()
+        predictor = lazy_paddle.inference.create_predictor(config)
 
         return predictor
 
-    def __call__(self, x) -> List[Any]:
-        # NOTE: Adjust input tensors to match the sorted sequence.
-        names = self.predictor.get_input_names()
-        if len(names) != len(x):
-            raise ValueError(
-                f"The number of inputs does not match the model: {len(names)} vs {len(x)}"
+    def _configure_trt(self, model_file, params_file, cache_dir):
+        # TODO: Support calibration
+        if USE_PIR_TRT:
+            trt_save_path = cache_dir / "trt" / self.model_file_prefix
+            _convert_trt(
+                self._option.model_name,
+                self._option.run_mode,
+                model_file,
+                params_file,
+                trt_save_path,
+                self._option.trt_dynamic_shapes,
+            )
+            model_file = trt_save_path.with_suffix(".json")
+            params_file = trt_save_path.with_suffix(".pdiparams")
+            config = lazy_paddle.inference.Config(str(model_file), str(params_file))
+        else:
+            PRECISION_MAP = {
+                "trt_int8": lazy_paddle.inference.Config.Precision.Int8,
+                "trt_fp32": lazy_paddle.inference.Config.Precision.Float32,
+                "trt_fp16": lazy_paddle.inference.Config.Precision.Half,
+            }
+
+            config = lazy_paddle.inference.Config(str(model_file), str(params_file))
+
+            config.set_optim_cache_dir(str(cache_dir / "optim_cache"))
+
+            config.enable_tensorrt_engine(
+                workspace_size=self._option.trt_max_workspace_size,
+                max_batch_size=self._option.trt_max_batch_size,
+                min_subgraph_size=self._option.trt_min_subgraph_size,
+                precision_mode=PRECISION_MAP[self._option.run_mode],
+                use_static=self._option.trt_use_static,
+                use_calib_mode=self._option.trt_use_calib_mode,
             )
-        indices = sorted(range(len(names)), key=names.__getitem__)
-        x = [x[indices.index(i)] for i in range(len(x))]
-        # TODO:
-        # Ensure that input tensors follow the model's input sequence without sorting.
 
-        inputs = self.copy2gpu(x)
-        outputs = self.infer(inputs)
-        pred = self.copy2cpu(outputs)
-        return pred
+            if self._option.trt_use_dynamic_shapes:
+                if self._option.trt_collect_shape_range_info:
+                    # NOTE: We always use a shape range info file.
+                    if self._option.trt_shape_range_info_path is not None:
+                        trt_shape_range_info_path = Path(
+                            self._option.trt_shape_range_info_path
+                        )
+                    else:
+                        trt_shape_range_info_path = cache_dir / "shape_range_info.pbtxt"
+                    should_collect_shape_range_info = True
+                    if not trt_shape_range_info_path.exists():
+                        trt_shape_range_info_path.parent.mkdir(
+                            parents=True, exist_ok=True
+                        )
+                        logging.info(
+                            f"Shape range info will be collected into {trt_shape_range_info_path}"
+                        )
+                    elif self._option.trt_discard_cached_shape_range_info:
+                        trt_shape_range_info_path.unlink()
+                        logging.info(
+                            f"The shape range info file ({trt_shape_range_info_path}) has been removed, and the shape range info will be re-collected."
+                        )
+                    else:
+                        logging.info(
+                            f"A shape range info file ({trt_shape_range_info_path}) already exists. There is no need to collect the info again."
+                        )
+                        should_collect_shape_range_info = False
+                    if should_collect_shape_range_info:
+                        _collect_trt_shape_range_info(
+                            str(model_file),
+                            str(params_file),
+                            self._option.device_id,
+                            str(trt_shape_range_info_path),
+                            self._option.trt_dynamic_shapes,
+                            self._option.trt_dynamic_shape_input_data,
+                        )
+                    config.enable_tuned_tensorrt_dynamic_shape(
+                        str(trt_shape_range_info_path),
+                        self._option.trt_allow_rebuild_at_runtime,
+                    )
+                else:
+                    if self._option.trt_dynamic_shapes is not None:
+                        min_shapes, opt_shapes, max_shapes = {}, {}, {}
+                        for (
+                            key,
+                            shapes,
+                        ) in self._option.trt_dynamic_shapes.items():
+                            min_shapes[key] = shapes[0]
+                            opt_shapes[key] = shapes[1]
+                            max_shapes[key] = shapes[2]
+                        config.set_trt_dynamic_shape_info(
+                            min_shapes, max_shapes, opt_shapes
+                        )
+                    else:
+                        raise RuntimeError("No dynamic shape information provided")
+
+        return config
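
For reference, a minimal sketch of the `trt_dynamic_shapes` layout that `_configure_trt` unpacks when shape range collection is disabled. The input name `"x"` and the shape values below are illustrative assumptions, not taken from any particular model:

```python
# Hypothetical dynamic-shape spec: input name -> [min, opt, max] shapes.
trt_dynamic_shapes = {
    "x": [
        [1, 3, 224, 224],  # minimum shape
        [1, 3, 320, 320],  # optimal shape
        [8, 3, 640, 640],  # maximum shape
    ],
}

min_shapes, opt_shapes, max_shapes = {}, {}, {}
for key, shapes in trt_dynamic_shapes.items():
    min_shapes[key] = shapes[0]
    opt_shapes[key] = shapes[1]
    max_shapes[key] = shapes[2]
# Note that Paddle Inference takes the dictionaries in (min, max, opt) order:
# config.set_trt_dynamic_shape_info(min_shapes, max_shapes, opt_shapes)
```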

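The `_sort_inputs` helper used in `StaticInfer.__call__` is not shown in this hunk. Judging from the `__call__` body removed at the end of this file's diff, it presumably reorders the input tensors to match the sorted input-name sequence; a sketch under that assumption:

```python
from typing import List, Sequence

import numpy as np


def _sort_inputs(inputs: Sequence[np.ndarray], names: List[str]) -> List[np.ndarray]:
    # Assumed behavior, mirroring the logic removed from `__call__`:
    # adjust the input tensors to match the sorted input-name sequence.
    indices = sorted(range(len(names)), key=names.__getitem__)
    return [inputs[indices.index(i)] for i in range(len(inputs))]
```
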
+ 6 - 6
paddlex/inference/models/common/ts/processors.py

@@ -32,6 +32,7 @@ __all__ = [
 ]
 
 
+@benchmark.timeit
 class TSCutOff:
     """Truncates time series data to a specified length for training.
 
@@ -53,7 +54,6 @@ class TSCutOff:
         super().__init__()
         self.size = size
 
-    @benchmark.timeit
     def __call__(self, ts_list: List) -> List:
         """Applies the cut off operation to a list of time series.
 
@@ -91,6 +91,7 @@ class TSCutOff:
         return ts_data
 
 
+@benchmark.timeit
 class TSNormalize:
     """Normalizes time series data using a pre-fitted scaler.
 
@@ -112,7 +113,6 @@ class TSNormalize:
         self.scaler = joblib.load(scale_path)
         self.params_info = params_info
 
-    @benchmark.timeit
     def __call__(self, ts_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
         """Applies normalization to a list of time series data frames.
 
@@ -147,6 +147,7 @@ class TSNormalize:
         return ts
 
 
+@benchmark.timeit
 class BuildTSDataset:
     """Constructs a time series dataset from a list of time series data frames."""
 
@@ -160,7 +161,6 @@ class BuildTSDataset:
         super().__init__()
         self.params_info = params_info
 
-    @benchmark.timeit
     def __call__(self, ts_list: List) -> List:
         """Applies the dataset construction to a list of time series.
 
@@ -185,6 +185,7 @@ class BuildTSDataset:
         return ts_data
 
 
+@benchmark.timeit
 class TimeFeature:
     """Extracts time features from time series data for forecasting."""
 
@@ -203,7 +204,6 @@ class TimeFeature:
         self.size = size
         self.holiday = holiday
 
-    @benchmark.timeit
     def __call__(self, ts_list: List) -> List:
         """Applies time feature extraction to a list of time series.
 
@@ -250,6 +250,7 @@ class TimeFeature:
         return ts
 
 
+@benchmark.timeit
 class TStoArray:
     """Converts time series data into arrays for model input."""
 
@@ -262,7 +263,6 @@ class TStoArray:
         super().__init__()
         self.input_data = input_data
 
-    @benchmark.timeit
     def __call__(self, ts_list: List[Dict[str, Any]]) -> List[List[np.ndarray]]:
         """Converts a list of time series data frames into arrays.
 
@@ -292,6 +292,7 @@ class TStoArray:
         return ts_list
 
 
+@benchmark.timeit
 class TStoBatch:
     """Convert a list of time series into batches for processing.
 
@@ -300,7 +301,6 @@ class TStoBatch:
     equal-length arrays or DataFrames.
     """
 
-    @benchmark.timeit
     def __call__(self, ts_list: List[np.ndarray]) -> List[np.ndarray]:
         """Convert a list of time series into batches.
 

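The change repeated throughout this and the following processor modules moves `@benchmark.timeit` from each `__call__` method up to the class. Per the `timeit_with_name` implementation shown later in `paddlex/inference/utils/benchmark.py`, decorating a class patches its `__call__` in place, so instances are timed under the class's qualified name. A stripped-down sketch of the mechanism (toy `Benchmark`, illustrative names only):

```python
import functools
import time


class Benchmark:
    def __init__(self):
        self.elapses = {}

    def timeit(self, func_or_cls):
        name = func_or_cls.__qualname__
        func = func_or_cls.__call__ if isinstance(func_or_cls, type) else func_or_cls

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            tic = time.perf_counter()
            output = func(*args, **kwargs)
            self.elapses.setdefault(name, []).append(time.perf_counter() - tic)
            return output

        if isinstance(func_or_cls, type):
            func_or_cls.__call__ = _wrapper  # patch the class, not an instance
            return func_or_cls
        return _wrapper


benchmark = Benchmark()


@benchmark.timeit
class TSCutOff:
    def __call__(self, ts_list):
        return ts_list  # real truncation logic omitted


TSCutOff()([1, 2, 3])
print(benchmark.elapses)  # {'TSCutOff': [...]}
```
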
+ 6 - 6
paddlex/inference/models/common/vision/processors.py

@@ -81,6 +81,7 @@ class _BaseResize:
         return rescaled_size, scale
 
 
+@benchmark.timeit
 class Resize(_BaseResize):
     """Resize the image."""
 
@@ -113,7 +114,6 @@ class Resize(_BaseResize):
 
         self.keep_ratio = keep_ratio
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.resize(img) for img in imgs]
@@ -135,6 +135,7 @@ class Resize(_BaseResize):
         return img
 
 
+@benchmark.timeit
 class ResizeByLong(_BaseResize):
     """
     Proportionally resize the image by specifying the target length of the
@@ -157,7 +158,6 @@ class ResizeByLong(_BaseResize):
         super().__init__(size_divisor=size_divisor, interp=interp, backend=backend)
         self.target_long_edge = target_long_edge
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.resize(img) for img in imgs]
@@ -177,6 +177,7 @@ class ResizeByLong(_BaseResize):
         return img
 
 
+@benchmark.timeit
 class ResizeByShort(_BaseResize):
     """
     Proportionally resize the image by specifying the target length of the
@@ -199,7 +200,6 @@ class ResizeByShort(_BaseResize):
         super().__init__(size_divisor=size_divisor, interp=interp, backend=backend)
         self.target_short_edge = target_short_edge
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.resize(img) for img in imgs]
@@ -219,6 +219,7 @@ class ResizeByShort(_BaseResize):
         return img
 
 
+@benchmark.timeit
 class Normalize:
     """Normalize the image."""
 
@@ -247,7 +248,6 @@ class Normalize:
         self.std = np.asarray(std).astype("float32")
         self.preserve_dtype = preserve_dtype
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         old_type = imgs[0].dtype
@@ -262,16 +262,16 @@ class Normalize:
         return list(imgs)
 
 
+@benchmark.timeit
 class ToCHWImage:
     """Reorder the dimensions of the image from HWC to CHW."""
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [img.transpose((2, 0, 1)) for img in imgs]
 
 
+@benchmark.timeit
 class ToBatch:
-    @benchmark.timeit
     def __call__(self, imgs):
         return [np.stack(imgs, axis=0).astype(dtype=np.float32, copy=False)]

+ 10 - 10
paddlex/inference/models/formula_recognition/processors.py

@@ -31,6 +31,7 @@ from ....utils import logging
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class MinMaxResize:
     """Class for resizing images to be within specified minimum and maximum dimensions, with padding and normalization."""
 
@@ -143,7 +144,6 @@ class MinMaxResize:
             img = np.dstack((img, img, img))
             return img
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """Applies the resize method to a list of images.
 
@@ -156,6 +156,7 @@ class MinMaxResize:
         return [self.resize(img) for img in imgs]
 
 
+@benchmark.timeit
 class LatexTestTransform:
     """
     A transform class for processing images according to Latex test requirements.
@@ -183,7 +184,6 @@ class LatexTestTransform:
         squeezed = np.squeeze(grayscale_image)
         return cv2.merge([squeezed] * self.num_output_channels)
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """
         Apply the transform to a list of images.
@@ -197,6 +197,7 @@ class LatexTestTransform:
         return [self.transform(img) for img in imgs]
 
 
+@benchmark.timeit
 class LatexImageFormat:
     """Class for formatting images to a specific format suitable for LaTeX."""
 
@@ -223,7 +224,6 @@ class LatexImageFormat:
         img_expanded = img[:, :, np.newaxis].transpose(2, 0, 1)
         return img_expanded[np.newaxis, :]
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """Applies the format method to a list of images.
 
@@ -236,6 +236,7 @@ class LatexImageFormat:
         return [self.format(img) for img in imgs]
 
 
+@benchmark.timeit
 class NormalizeImage(object):
     """Normalize an image by subtracting the mean and dividing by the standard deviation.
 
@@ -279,12 +280,12 @@ class NormalizeImage(object):
         img = (img.astype("float32") * self.scale - self.mean) / self.std
         return img
 
-    @benchmark.timeit
     def __call__(self, imgs: List[Union[np.ndarray, Image.Image]]) -> List[np.ndarray]:
         """Apply normalization to a list of images."""
         return [self.normalize(img) for img in imgs]
 
 
+@benchmark.timeit
 class ToBatch(object):
     """A class for batching images."""
 
@@ -292,7 +293,6 @@ class ToBatch(object):
         """Initializes the ToBatch object."""
         super(ToBatch, self).__init__()
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """Concatenates a list of images into a single batch.
 
@@ -308,6 +308,7 @@ class ToBatch(object):
         return x
 
 
+@benchmark.timeit
 class LaTeXOCRDecode(object):
     """Class for decoding LaTeX OCR tokens based on a provided character list."""
 
@@ -377,7 +378,6 @@ class LaTeXOCRDecode(object):
         ]
         return [self.post_process(dec_str) for dec_str in dec_str_list]
 
-    @benchmark.timeit
     def __call__(
         self,
         preds: np.ndarray,
@@ -409,6 +409,7 @@ class LaTeXOCRDecode(object):
         return text, label
 
 
+@benchmark.timeit
 class UniMERNetImgDecode(object):
     """Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
 
@@ -550,7 +551,6 @@ class UniMERNetImgDecode(object):
         )
         return np.array(ImageOps.expand(img, padding))
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[Optional[np.ndarray]]:
         """Calls the img_decode method on a list of images.
 
@@ -562,6 +562,7 @@ class UniMERNetImgDecode(object):
         return [self.img_decode(img) for img in imgs]
 
 
+@benchmark.timeit
 class UniMERNetDecode(object):
     """Class for decoding tokenized inputs using UniMERNet tokenizer.
 
@@ -879,7 +880,6 @@ class UniMERNetDecode(object):
         text = self.normalize(text)
         return text
 
-    @benchmark.timeit
     def __call__(
         self,
         preds: np.ndarray,
@@ -909,6 +909,7 @@ class UniMERNetDecode(object):
         return text, label
 
 
+@benchmark.timeit
 class UniMERNetTestTransform:
     """
     A class for transforming images according to UniMERNet test specifications.
@@ -943,7 +944,6 @@ class UniMERNetTestTransform:
         img = cv2.merge([squeezed] * self.num_output_channels)
         return img
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """
         Applies the transform to a list of images.
@@ -957,6 +957,7 @@ class UniMERNetTestTransform:
         return [self.transform(img) for img in imgs]
 
 
+@benchmark.timeit
 class UniMERNetImageFormat:
     """Class for formatting images to UniMERNet's required format."""
 
@@ -984,7 +985,6 @@ class UniMERNetImageFormat:
         img_expanded = img[:, :, np.newaxis].transpose(2, 0, 1)
         return img_expanded[np.newaxis, :]
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """Applies the format method to a list of images.
 

+ 2 - 2
paddlex/inference/models/image_classification/processors.py

@@ -19,6 +19,7 @@ from ..common.vision import F
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class Crop:
     """Crop region from the image."""
 
@@ -42,7 +43,6 @@ class Crop:
             raise ValueError("Unsupported interpolation method")
         self.mode = mode
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.crop(img) for img in imgs]
@@ -66,6 +66,7 @@ class Crop:
         return img
 
 
+@benchmark.timeit
 class Topk:
     """Topk Transform"""
 
@@ -80,7 +81,6 @@ class Topk:
         class_id_map = {id: str(lb) for id, lb in enumerate(class_ids)}
         return class_id_map
 
-    @benchmark.timeit
     def __call__(self, preds, topk=5):
         indexes = preds[0].argsort(axis=1)[:, -topk:][:, ::-1].astype("int32")
         scores = [

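As a quick worked example of the top-k indexing in `Topk.__call__` (the hunk is truncated above; the score gathering shown here follows the same pattern and is an illustrative reconstruction, with made-up values):

```python
import numpy as np

# Toy class scores for a batch of 2 samples over 4 classes (values are made up).
preds = [np.array([[0.1, 0.5, 0.2, 0.2],
                   [0.7, 0.1, 0.1, 0.1]])]
topk = 2
indexes = preds[0].argsort(axis=1)[:, -topk:][:, ::-1].astype("int32")
print(indexes)
# [[1 3]
#  [0 3]]  -> top-2 class ids per sample, best first
scores = [pred[idx] for pred, idx in zip(preds[0], indexes)]
print(scores)
# [array([0.5, 0.2]), array([0.7, 0.1])]
```
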
+ 1 - 1
paddlex/inference/models/image_feature/processors.py

@@ -17,6 +17,7 @@ import numpy as np
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class NormalizeFeatures:
     """Normalize Features Transform"""
 
@@ -26,7 +27,6 @@ class NormalizeFeatures:
         features = np.divide(preds[0], feas_norm)
         return features
 
-    @benchmark.timeit
     def __call__(self, preds):
         normalized_features = [self._normalize(feature) for feature in preds]
         return normalized_features

+ 1 - 1
paddlex/inference/models/image_multilabel_classification/processors.py

@@ -18,6 +18,7 @@ from typing import Union
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class MultiLabelThreshOutput:
     """MultiLabelThresh Transform"""
 
@@ -33,7 +34,6 @@ class MultiLabelThreshOutput:
         class_id_map = {id: str(lb) for id, lb in enumerate(class_ids)}
         return class_id_map
 
-    @benchmark.timeit
     def __call__(self, preds, threshold: Union[float, dict, list]):
         threshold_list = []
         num_classes = preds[0].shape[-1]

+ 1 - 1
paddlex/inference/models/image_unwarping/processors.py

@@ -18,6 +18,7 @@ from typing import List, Union, Tuple
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class DocTrPostProcess:
     """
     Post-processing class for cropping regions from images (though currently only performs scaling and color channel adjustments).
@@ -46,7 +47,6 @@ class DocTrPostProcess:
             np.float32(scale) if isinstance(scale, (str, float)) else np.float32(255.0)
         )
 
-    @benchmark.timeit
     def __call__(
         self, imgs: List[Union[np.ndarray, Tuple[np.ndarray, ...]]]
     ) -> List[np.ndarray]:

+ 1 - 1
paddlex/inference/models/instance_segmentation/processors.py

@@ -41,6 +41,7 @@ def extract_masks_from_boxes(boxes, masks):
     return new_masks
 
 
+@benchmark.timeit
 class InstanceSegPostProcess(object):
     """Save Result Transform"""
 
@@ -79,7 +80,6 @@ class InstanceSegPostProcess(object):
 
         return result
 
-    @benchmark.timeit
     def __call__(
         self,
         batch_outputs: List[dict],

+ 2 - 2
paddlex/inference/models/keypoint_detection/processors.py

@@ -66,6 +66,7 @@ def get_warp_matrix(
     return matrix
 
 
+@benchmark.timeit
 class TopDownAffine:
     """refer to https://github.com/open-mmlab/mmpose/blob/71ec36ebd63c475ab589afc817868e749a61491f/mmpose/datasets/transforms/topdown_transforms.py#L13
     Get the bbox image as the model input by affine transform.
@@ -137,7 +138,6 @@ class TopDownAffine:
 
         return img, center, scale
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         for data in datas:
             ori_img = data["img"]
@@ -198,6 +198,7 @@ def transform_preds(
     return target_coords
 
 
+@benchmark.timeit
 class KptPostProcess:
     """Save Result Transform"""
 
@@ -218,7 +219,6 @@ class KptPostProcess:
             for kpt, score in zip(keypoints, scores)
         ]
 
-    @benchmark.timeit
     def __call__(self, batch_outputs: List[dict], datas: List[dict]) -> List[Kpts]:
         """Apply the post-processing to a batch of outputs.
 

+ 9 - 9
paddlex/inference/models/object_detection/processors.py

@@ -27,10 +27,10 @@ Boxes = List[dict]
 Number = Union[int, float]
 
 
+@benchmark.timeit
 class ReadImage(CommonReadImage):
     """Reads images from a list of raw image data or file paths."""
 
-    @benchmark.timeit
     def __call__(self, raw_imgs: List[Union[ndarray, str, dict]]) -> List[dict]:
         """Processes the input list of raw image data or file paths and returns a list of dictionaries containing image information.
 
@@ -94,8 +94,8 @@ class ReadImage(CommonReadImage):
             )
 
 
+@benchmark.timeit
 class Resize(CommonResize):
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         """
         Args:
@@ -125,6 +125,7 @@ class Resize(CommonResize):
         return datas
 
 
+@benchmark.timeit
 class Normalize(CommonNormalize):
     """Normalizes images in a list of dictionaries containing image data"""
 
@@ -141,7 +142,6 @@ class Normalize(CommonNormalize):
             img = img.astype(old_type, copy=False)
         return img
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         """Normalizes images in a list of dictionaries. Iterates over each dictionary,
         applies normalization to the 'img' key, and returns the modified list.
@@ -151,10 +151,10 @@ class Normalize(CommonNormalize):
         return datas
 
 
+@benchmark.timeit
 class ToCHWImage:
     """Converts images in a list of dictionaries from HWC to CHW format."""
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         """Converts the image data in the list of dictionaries from HWC to CHW format in-place.
 
@@ -169,6 +169,7 @@ class ToCHWImage:
         return datas
 
 
+@benchmark.timeit
 class ToBatch:
     """
     Class for batch processing of data dictionaries.
@@ -212,11 +213,11 @@ class ToBatch:
                 dtype=dtype, copy=False
             )
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> Sequence[ndarray]:
         return [self.apply(datas, key) for key in self.ordered_required_keys]
 
 
+@benchmark.timeit
 class DetPad:
     """
     Pad image to a specified size.
@@ -248,13 +249,13 @@ class DetPad:
         canvas[0:im_h, 0:im_w, :] = im.astype(np.float32)
         return canvas
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         for data in datas:
             data["img"] = self.apply(data["img"])
         return datas
 
 
+@benchmark.timeit
 class PadStride:
     """padding image for model with FPN , instead PadBatch(pad_to_stride, pad_gt) in original config
     Args:
@@ -283,7 +284,6 @@ class PadStride:
         padding_im[:, :im_h, :im_w] = im
         return padding_im
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
         for data in datas:
             data["img"] = self.apply(data["img"])
@@ -382,6 +382,7 @@ def get_affine_transform(
     return trans
 
 
+@benchmark.timeit
 class WarpAffine:
     """Apply warp affine transformation to the image based on the given parameters.
 
@@ -446,7 +447,6 @@ class WarpAffine:
 
         return inp
 
-    @benchmark.timeit
     def __call__(self, datas: List[dict]) -> List[dict]:
 
         for data in datas:
@@ -661,6 +661,7 @@ def check_containment(boxes, formula_index=None):
     return contains_other, contained_by_other
 
 
+@benchmark.timeit
 class DetPostProcess:
     """Save Result Transform
 
@@ -769,7 +770,6 @@ class DetPostProcess:
             )
         return boxes
 
-    @benchmark.timeit
     def __call__(
         self,
         batch_outputs: List[dict],

+ 4 - 4
paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py

@@ -98,6 +98,7 @@ def _text_pad_batch_data(
     return return_list if len(return_list) > 1 else return_list[0]
 
 
+@benchmark.timeit
 class GroundingDINOPostProcessor(object):
     """PostProcessors for GroundingDINO"""
 
@@ -118,7 +119,6 @@ class GroundingDINOPostProcessor(object):
         self.box_threshold = box_threshold
         self.text_threshold = text_threshold
 
-    @benchmark.timeit
     def __call__(
         self,
         pred_boxes,
@@ -208,6 +208,7 @@ class GroundingDINOPostProcessor(object):
             raise NotImplementedError("posmap must be 1-dim")
 
 
+@benchmark.timeit
 class GroundingDINOProcessor(object):
     """Image and Text Processors for GroundingDINO"""
 
@@ -236,7 +237,6 @@ class GroundingDINOProcessor(object):
         assert os.path.isdir(tokenizer_dir), f"{tokenizer_dir} not exists."
         self.tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
 
-    @benchmark.timeit
     def __call__(
         self,
         images: List[PIL.Image.Image],
@@ -264,6 +264,7 @@ class GroundingDINOProcessor(object):
         return [arr.numpy() for arr in paddle_rst]
 
 
+@benchmark.timeit
 class GroundingDinoTextProcessor(object):
     """Constructs a GroundingDino text processor."""
 
@@ -273,7 +274,6 @@ class GroundingDinoTextProcessor(object):
     ):
         self.max_words = max_words
 
-    @benchmark.timeit
     def __call__(
         self,
         input_ids,
@@ -367,6 +367,7 @@ class GroundingDinoTextProcessor(object):
         return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list
 
 
+@benchmark.timeit
 class GroundingDinoImageProcessor(object):
     """Constructs a GroundingDino image processor."""
 
@@ -391,7 +392,6 @@ class GroundingDinoImageProcessor(object):
         self.image_std = image_std
         self.do_nested = do_nested
 
-    @benchmark.timeit
     def __call__(self, images, **kwargs):
         """Preprocess an image or a batch of images."""
         return self.preprocess(images, **kwargs)

+ 2 - 2
paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py

@@ -127,6 +127,7 @@ class SAMProcessor(object):
         return [masks]
 
 
+@benchmark.timeit
 class SamPromptProcessor(object):
     """Constructs a Sam prompt processor."""
 
@@ -160,7 +161,6 @@ class SamPromptProcessor(object):
         boxes = self.apply_coords(boxes.reshape([-1, 2, 2]), original_size)
         return boxes.reshape([-1, 4])
 
-    @benchmark.timeit
     def __call__(
         self,
         original_size,
@@ -182,6 +182,7 @@ class SamPromptProcessor(object):
             return box.astype(np.float32)
 
 
+@benchmark.timeit
 class SamImageProcessor(object):
     """Constructs a Sam image processor."""
 
@@ -215,7 +216,6 @@ class SamImageProcessor(object):
 
         return np.array(T.resize(image, target_size))
 
-    @benchmark.timeit
     def __call__(self, images, **kwargs):
         if not isinstance(images, (list, tuple)):
             images = [images]

+ 2 - 2
paddlex/inference/models/semantic_segmentation/processors.py

@@ -26,6 +26,7 @@ from ..common.vision import funcs as F
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class Resize(_BaseResize):
     """Resize the image."""
 
@@ -53,7 +54,6 @@ class Resize(_BaseResize):
 
         self.keep_ratio = keep_ratio
 
-    @benchmark.timeit
     def __call__(self, imgs, target_size=None):
         """apply"""
         target_size = self.target_size if target_size is None else target_size
@@ -83,6 +83,7 @@ class Resize(_BaseResize):
         return img
 
 
+@benchmark.timeit
 class SegPostProcess:
     """Semantic Segmentation PostProcess
 
@@ -90,7 +91,6 @@ class SegPostProcess:
     restoring the prediction segmentation map to the original image size for now.
     """
 
-    @benchmark.timeit
     def __call__(self, imgs, src_images):
         assert len(imgs) == len(src_images)
 

+ 2 - 2
paddlex/inference/models/table_structure_recognition/processors.py

@@ -20,6 +20,7 @@ from ..common.vision import funcs as F
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class Pad:
     """Pad the image."""
 
@@ -56,12 +57,12 @@ class Pad:
 
         return [img, [img.shape[1], img.shape[0]]]
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.apply(img) for img in imgs]
 
 
+@benchmark.timeit
 class TableLabelDecode:
     """decode the table model outputs(probs) to character str"""
 
@@ -121,7 +122,6 @@ class TableLabelDecode:
             assert False, "unsupported type %s in get_beg_end_flag_idx" % beg_or_end
         return idx
 
-    @benchmark.timeit
     def __call__(self, pred, img_size, ori_img_size):
         """apply"""
         bbox_preds, structure_probs = [], []

+ 3 - 3
paddlex/inference/models/text_detection/processors.py

@@ -29,6 +29,7 @@ from ....utils import logging
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class DetResizeForTest:
     """DetResizeForTest"""
 
@@ -51,7 +52,6 @@ class DetResizeForTest:
             self.limit_side_len = 736
             self.limit_type = "min"
 
-    @benchmark.timeit
     def __call__(
         self,
         imgs,
@@ -183,6 +183,7 @@ class DetResizeForTest:
         return img, [ratio_h, ratio_w]
 
 
+@benchmark.timeit
 class NormalizeImage:
     """normalize image such as substract mean, divide std"""
 
@@ -198,7 +199,6 @@ class NormalizeImage:
         self.mean = np.array(mean).reshape(shape).astype("float32")
         self.std = np.array(std).reshape(shape).astype("float32")
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
 
@@ -208,6 +208,7 @@ class NormalizeImage:
         return [norm(img) for img in imgs]
 
 
+@benchmark.timeit
 class DBPostProcess:
     """
     The post process for Differentiable Binarization (DB).
@@ -415,7 +416,6 @@ class DBPostProcess:
         cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
         return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
 
-    @benchmark.timeit
     def __call__(
         self,
         preds,

+ 4 - 4
paddlex/inference/models/text_recognition/processors.py

@@ -30,6 +30,7 @@ from ....utils import logging
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class OCRReisizeNormImg:
     """for ocr image resize and normalization"""
 
@@ -58,7 +59,6 @@ class OCRReisizeNormImg:
         padding_im[:, :, 0:resized_w] = resized_image
         return padding_im
 
-    @benchmark.timeit
     def __call__(self, imgs):
         """apply"""
         return [self.resize(img) for img in imgs]
@@ -73,6 +73,7 @@ class OCRReisizeNormImg:
         return img
 
 
+@benchmark.timeit
 class BaseRecLabelDecode:
     """Convert between text-label and text-index"""
 
@@ -148,7 +149,6 @@ class BaseRecLabelDecode:
         """get_ignored_tokens"""
         return [0]  # for ctc blank
 
-    @benchmark.timeit
     def __call__(self, pred):
         """apply"""
         preds = np.array(pred)
@@ -165,13 +165,13 @@ class BaseRecLabelDecode:
         return texts, scores
 
 
+@benchmark.timeit
 class CTCLabelDecode(BaseRecLabelDecode):
     """Convert between text-label and text-index"""
 
     def __init__(self, character_list=None, use_space_char=True):
         super().__init__(character_list, use_space_char=use_space_char)
 
-    @benchmark.timeit
     def __call__(self, pred):
         """apply"""
         preds = np.array(pred[0])
@@ -191,6 +191,7 @@ class CTCLabelDecode(BaseRecLabelDecode):
         return character_list
 
 
+@benchmark.timeit
 class ToBatch:
     """A class for batching and padding images to a uniform width."""
 
@@ -217,7 +218,6 @@ class ToBatch:
             padded_imgs.append(padded_img)
         return padded_imgs
 
-    @benchmark.timeit
     def __call__(self, imgs: List[np.ndarray]) -> List[np.ndarray]:
         """Call method to pad images and stack them into a batch.
 

+ 1 - 1
paddlex/inference/models/ts_anomaly_detection/processors.py

@@ -19,6 +19,7 @@ import pandas as pd
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class GetAnomaly:
     """A class to detect anomalies in time series data based on a model threshold."""
 
@@ -34,7 +35,6 @@ class GetAnomaly:
         self.model_threshold = model_threshold
         self.info_params = info_params
 
-    @benchmark.timeit
     def __call__(
         self, ori_ts_list: List[Dict[str, Any]], pred_list: List[np.ndarray]
     ) -> List[pd.DataFrame]:

+ 2 - 2
paddlex/inference/models/ts_classification/processors.py

@@ -19,6 +19,7 @@ from typing import List, Any, Dict
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class GetCls:
     """A class to process prediction outputs and return class IDs and scores."""
 
@@ -26,7 +27,6 @@ class GetCls:
         """Initializes the GetCls instance."""
         super().__init__()
 
-    @benchmark.timeit
     def __call__(self, pred_list: List[Any]) -> List[pd.DataFrame]:
         """
         Processes a list of predictions and returns a list of DataFrames with class IDs and scores.
@@ -59,6 +59,7 @@ class GetCls:
         return result
 
 
+@benchmark.timeit
 class BuildPadMask:
     """A class to build padding masks for time series data."""
 
@@ -73,7 +74,6 @@ class BuildPadMask:
         super().__init__()
         self.input_data = input_data
 
-    @benchmark.timeit
     def __call__(self, ts_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         Applies padding mask to a list of time series data.

+ 2 - 2
paddlex/inference/models/ts_forecasting/processors.py

@@ -20,6 +20,7 @@ import pandas as pd
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class TSDeNormalize:
     """A class to de-normalize time series prediction data using a pre-fitted scaler."""
 
@@ -35,7 +36,6 @@ class TSDeNormalize:
         self.scaler = joblib.load(scale_path)
         self.params_info = params_info
 
-    @benchmark.timeit
     def __call__(self, preds_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
         """
         Applies de-normalization to a list of prediction DataFrames.
@@ -63,6 +63,7 @@ class TSDeNormalize:
         return pred
 
 
+@benchmark.timeit
 class ArraytoTS:
     """A class to convert arrays of predictions into time series format."""
 
@@ -76,7 +77,6 @@ class ArraytoTS:
         super().__init__()
         self.info_params = info_params
 
-    @benchmark.timeit
     def __call__(
         self, ori_ts_list: List[Dict[str, Any]], pred_list: List[np.ndarray]
     ) -> List[pd.DataFrame]:

+ 6 - 6
paddlex/inference/models/video_classification/processors.py

@@ -28,6 +28,7 @@ import lazy_paddle
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class Scale:
     """Scale images."""
 
@@ -123,7 +124,6 @@ class Scale:
         imgs = resized_imgs
         return imgs
 
-    @benchmark.timeit
     def __call__(self, videos: List[np.ndarray]) -> List[np.ndarray]:
         """
         Apply the scaling operation to a list of videos.
@@ -138,6 +138,7 @@ class Scale:
         return [self.scale(video) for video in videos]
 
 
+@benchmark.timeit
 class CenterCrop:
     """Center crop images."""
 
@@ -184,7 +185,6 @@ class CenterCrop:
                 crop_imgs.append(img[y1 : y1 + th, x1 : x1 + tw])
         return crop_imgs
 
-    @benchmark.timeit
     def __call__(self, videos: List[np.ndarray]) -> List[np.ndarray]:
         """
         Apply the center crop operation to a list of videos.
@@ -198,6 +198,7 @@ class CenterCrop:
         return [self.center_crop(video) for video in videos]
 
 
+@benchmark.timeit
 class Image2Array:
     """Convert a sequence of images to a numpy array with optional transposition."""
 
@@ -238,7 +239,6 @@ class Image2Array:
                 t_imgs = t_imgs.transpose([3, 0, 1, 2])  # cthw
         return t_imgs
 
-    @benchmark.timeit
     def __call__(self, videos: List[np.ndarray]) -> List[np.ndarray]:
         """
         Apply the image to array conversion to a list of videos.
@@ -252,6 +252,7 @@ class Image2Array:
         return [self.img2array(video) for video in videos]
 
 
+@benchmark.timeit
 class NormalizeVideo:
     """
     Normalize video frames by subtracting the mean and dividing by the standard deviation.
@@ -316,7 +317,6 @@ class NormalizeVideo:
         imgs = np.expand_dims(imgs, axis=0).copy()
         return imgs
 
-    @benchmark.timeit
     def __call__(self, videos: List[np.ndarray]) -> List[np.ndarray]:
         """
         Apply normalization to a list of videos.
@@ -330,6 +330,7 @@ class NormalizeVideo:
         return [self.normalize_video(video) for video in videos]
 
 
+@benchmark.timeit
 class VideoClasTopk:
     """Applies a top-k transformation on video classification predictions."""
 
@@ -374,7 +375,6 @@ class VideoClasTopk:
         class_id_map = {id: str(lb) for id, lb in enumerate(class_ids)}
         return class_id_map
 
-    @benchmark.timeit
     def __call__(
         self, preds: np.ndarray, topk: int = 5
     ) -> Tuple[np.ndarray, List[np.ndarray], List[List[str]]]:
@@ -401,10 +401,10 @@ class VideoClasTopk:
         return indexes, scores, label_names
 
 
+@benchmark.timeit
 class ToBatch:
     """A class for batching videos."""
 
-    @benchmark.timeit
     def __call__(self, videos: List[np.ndarray]) -> List[np.ndarray]:
         """Call method to stack videos into a batch.
 

+ 4 - 4
paddlex/inference/models/video_detection/processors.py

@@ -24,6 +24,7 @@ import lazy_paddle as paddle
 from ...utils.benchmark import benchmark
 
 
+@benchmark.timeit
 class ResizeVideo:
     """Resizes frames of a video to a specified target size.
 
@@ -77,7 +78,6 @@ class ResizeVideo:
                 )
         return video
 
-    @benchmark.timeit
     def __call__(self, videos: List) -> List:
         """Resizes frames of multiple videos.
 
@@ -91,6 +91,7 @@ class ResizeVideo:
         return [self.resize(video) for video in videos]
 
 
+@benchmark.timeit
 class Image2Array:
     """Convert a sequence of images to a numpy array with optional transposition."""
 
@@ -132,7 +133,6 @@ class Image2Array:
             video[i] = video_one
         return video
 
-    @benchmark.timeit
     def __call__(self, videos: List[List[np.ndarray]]) -> List[np.ndarray]:
         """
         Process videos by converting each video to a transposed numpy array.
@@ -147,6 +147,7 @@ class Image2Array:
         return [self.img2array(video) for video in videos]
 
 
+@benchmark.timeit
 class NormalizeVideo:
     """
     A class to normalize video frames by scaling the pixel values.
@@ -181,7 +182,6 @@ class NormalizeVideo:
 
         return video
 
-    @benchmark.timeit
     def __call__(self, videos: List[List[np.ndarray]]) -> List[List[np.ndarray]]:
         """
         Apply normalization to a list of videos.
@@ -404,6 +404,7 @@ def bbox_iou(box1, box2, x1y1x2y2=True):
     return carea / uarea
 
 
+@benchmark.timeit
 class DetVideoPostProcess:
     """
     A class used to perform post-processing on detection results in videos.
@@ -451,6 +452,5 @@ class DetVideoPostProcess:
             pred_all.append(preds)
         return pred_all
 
-    @benchmark.timeit
     def __call__(self, preds: List, nms_thresh, score_thresh) -> List:
         return [self.postprocess(pred, nms_thresh, score_thresh) for pred in preds]

+ 109 - 37
paddlex/inference/utils/benchmark.py

@@ -20,9 +20,14 @@ from pathlib import Path
 import numpy as np
 from prettytable import PrettyTable
 
-from ...utils.flags import INFER_BENCHMARK, INFER_BENCHMARK_OUTPUT
+from ...utils.flags import INFER_BENCHMARK, INFER_BENCHMARK_OUTPUT_DIR
 from ...utils import logging
 
+ENTRY_POINT_NAME = "_entry_point_"
+
+# XXX: Global mutable state
+_inference_operations = []
+
 
 class Benchmark:
     def __init__(self, enabled):
@@ -30,32 +35,54 @@ class Benchmark:
         self._elapses = {}
         self._warmup = False
 
-    def timeit(self, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            if not self._enabled:
-                return func(*args, **kwargs)
+    def timeit_with_name(self, name=None):
+        # TODO: Refactor
+        def _deco(func_or_cls):
+            nonlocal name
+            if name is None:
+                name = func_or_cls.__qualname__
+
+            if isinstance(func_or_cls, type):
+                if not hasattr(func_or_cls, "__call__"):
+                    raise TypeError
+                func = func_or_cls.__call__
+            else:
+                if not callable(func_or_cls):
+                    raise TypeError
+                func = func_or_cls
+
+            @functools.wraps(func)
+            def _wrapper(*args, **kwargs):
+                if not self._enabled:
+                    return func(*args, **kwargs)
 
-            name = func.__qualname__
+                tic = time.perf_counter()
+                output = func(*args, **kwargs)
+                if isinstance(output, GeneratorType):
+                    return self.watch_generator(output, name)
+                else:
+                    self._update(time.perf_counter() - tic, name)
+                    return output
 
-            tic = time.time()
-            output = func(*args, **kwargs)
-            if isinstance(output, GeneratorType):
-                return self.watch_generator(output, name)
+            if isinstance(func_or_cls, type):
+                func_or_cls.__call__ = _wrapper
+                return func_or_cls
             else:
-                self._update(time.time() - tic, name)
-                return output
+                return _wrapper
+
+        return _deco
 
-        return wrapper
+    def timeit(self, func_or_cls):
+        return self.timeit_with_name(None)(func_or_cls)
 
     def watch_generator(self, generator, name):
         @functools.wraps(generator)
         def wrapper():
             while True:
                 try:
-                    tic = time.time()
+                    tic = time.perf_counter()
                     item = next(generator)
-                    self._update(time.time() - tic, name)
+                    self._update(time.perf_counter() - tic, name)
                     yield item
                 except StopIteration:
                     break
@@ -90,37 +117,57 @@ class Benchmark:
         self.reset()
 
     def gather(self, batch_size):
-        logs = {k.split(".")[0]: v for k, v in self.logs.items()}
+        # NOTE: The gathering logic here is based on the following assumptions:
+        # 1. The operations are performed sequentially.
+        # 2. An operation is performed only once at each iteration.
+        # 3. Operations do not nest, except that the entry point operation
+        #    contains all other operations.
+        # 4. The input batch size for each operation is `batch_size`.
+        # 5. Inference operations are always performed, while preprocessing and
+        #    postprocessing operations are optional.
+        # 6. If present, preprocessing operations are always performed before
+        #    inference operations, and inference operations are completed before
+        #    any postprocessing operations. There is no interleaving among these
+        #    stages.
 
-        iters = len(logs["Infer"])
+        logs = {k: v for k, v in self.logs.items()}
+
+        summary = {"preprocessing": 0, "inference": 0, "postprocessing": 0}
+        base_predictor_time_list = logs.pop(ENTRY_POINT_NAME)
+        iters = len(base_predictor_time_list)
         instances = iters * batch_size
+        summary["end_to_end"] = np.mean(base_predictor_time_list)
+
         detail_list = []
-        summary = {"preprocess": 0, "inference": 0, "postprocess": 0}
-        op_tag = "preprocess"
+        op_tag = "preprocessing"
 
         for name, time_list in logs.items():
+            assert len(time_list) == iters
             avg = np.mean(time_list)
             detail_list.append(
                 (iters, batch_size, instances, name, avg, avg / batch_size)
             )
 
-            if name in ["Copy2GPU", "Infer", "Copy2CPU"]:
+            if name in _inference_operations:
                 summary["inference"] += avg
-                op_tag = "postprocess"
+                op_tag = "postprocessing"
             else:
                 summary[op_tag] += avg
 
-        summary["end2end"] = (
-            summary["preprocess"] + summary["inference"] + summary["postprocess"]
+        summary["core"] = (
+            summary["preprocessing"] + summary["inference"] + summary["postprocessing"]
         )
+
+        summary["other"] = summary["end_to_end"] - summary["core"]
+
         summary_list = [
             (
                 iters,
                 batch_size,
                 instances,
-                "PreProcess",
-                summary["preprocess"],
-                summary["preprocess"] / batch_size,
+                "Preprocessing",
+                summary["preprocessing"],
+                summary["preprocessing"] / batch_size,
             ),
             (
                 iters,
@@ -134,17 +181,33 @@ class Benchmark:
                 iters,
                 batch_size,
                 instances,
-                "PostProcess",
-                summary["postprocess"],
-                summary["postprocess"] / batch_size,
+                "Postprocessing",
+                summary["postprocessing"],
+                summary["postprocessing"] / batch_size,
+            ),
+            (
+                iters,
+                batch_size,
+                instances,
+                "Core",
+                summary["core"],
+                summary["core"] / batch_size,
+            ),
+            (
+                iters,
+                batch_size,
+                instances,
+                "Other",
+                summary["other"],
+                summary["other"] / batch_size,
             ),
             (
                 iters,
                 batch_size,
                 instances,
-                "End2End",
-                summary["end2end"],
-                summary["end2end"] / batch_size,
+                "End-to-End",
+                summary["end_to_end"],
+                summary["end_to_end"] / batch_size,
             ),
         ]
 
@@ -158,7 +221,7 @@ class Benchmark:
                 "Iters",
                 "Batch Size",
                 "Instances",
-                "Stage",
+                "Type",
                 "Avg Time Per Iter (ms)",
                 "Avg Time Per Instance (ms)",
             ]
@@ -191,7 +254,7 @@ class Benchmark:
                 "Iters",
                 "Batch Size",
                 "Instances",
-                "Stage",
+                "Type",
                 "Avg Time Per Iter (ms)",
                 "Avg Time Per Instance (ms)",
             ]
@@ -204,8 +267,8 @@ class Benchmark:
             logging.info(header)
             logging.info(table)
 
-            if INFER_BENCHMARK_OUTPUT:
-                save_dir = Path(INFER_BENCHMARK_OUTPUT)
+            if INFER_BENCHMARK_OUTPUT_DIR:
+                save_dir = Path(INFER_BENCHMARK_OUTPUT_DIR)
                 save_dir.mkdir(parents=True, exist_ok=True)
                 csv_data = [detail_head, *detail_list]
                 with open(Path(save_dir) / "detail.csv", "w", newline="") as file:
@@ -218,6 +281,15 @@ class Benchmark:
                     writer.writerows(csv_data)
 
 
+def get_inference_operations():
+    return _inference_operations
+
+
+def set_inference_operations(val):
+    global _inference_operations
+    _inference_operations = val
+
+
 if INFER_BENCHMARK:
     benchmark = Benchmark(enabled=True)
 else:

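Because `gather` now decides what counts as inference time by checking operation names against this registry, callers are expected to register the inference operation names before results are collected. A hedged usage sketch, with names taken from the `static_infer.py` classes above (the exact registration site in the predictor is not shown in this diff):

```python
from paddlex.inference.utils.benchmark import (
    get_inference_operations,
    set_inference_operations,
)

# Any timed operation before these names is attributed to "Preprocessing",
# these names themselves to "Inference", and anything after them to
# "Postprocessing" (see the assumptions documented in `gather`).
set_inference_operations(
    ["PaddleCopyToDevice", "PaddleModelInfer", "PaddleCopyToHost", "PaddleInferChainLegacy"]
)
assert "PaddleModelInfer" in get_inference_operations()
```
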
+ 82 - 0
paddlex/inference/utils/hpi.py

@@ -0,0 +1,82 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from os import PathLike
+from pathlib import Path
+from typing import Dict, List, Literal, Optional, Tuple, TypedDict, Union
+
+from pydantic import BaseModel
+from typing_extensions import TypeAlias
+
+from ...utils.flags import FLAGS_json_format_model
+
+
+class PaddleInferenceInfo(BaseModel):
+    trt_dynamic_shapes: Optional[Dict[str, List[List[int]]]] = None
+    trt_dynamic_shape_input_data: Optional[Dict[str, List[List[float]]]] = None
+
+
+class TensorRTInfo(BaseModel):
+    dynamic_shapes: Optional[Dict[str, List[List[int]]]] = None
+
+
+class InferenceBackendInfoCollection(BaseModel):
+    paddle_infer: Optional[PaddleInferenceInfo] = None
+    tensorrt: Optional[TensorRTInfo] = None
+
+
+# TODO: Consider whether a `TypedDict` would be more convenient here.
+class HPIInfo(BaseModel):
+    backend_configs: Optional[InferenceBackendInfoCollection] = None
+
+
+# For multi-backend inference only
+InferenceBackend: TypeAlias = Literal[
+    "paddle", "openvino", "onnxruntime", "tensorrt", "om"
+]
+
+
+ModelFormat: TypeAlias = Literal["paddle", "onnx", "om"]
+
+
+class ModelPaths(TypedDict, total=False):
+    paddle: Tuple[Path, Path]
+    onnx: Path
+    om: Path
+
+
+def get_model_paths(
+    model_dir: Union[str, PathLike], model_file_prefix: str
+) -> ModelPaths:
+    model_dir = Path(model_dir)
+    model_paths: ModelPaths = {}
+    pd_model_path = None
+    if FLAGS_json_format_model:
+        if (model_dir / f"{model_file_prefix}.json").exists():
+            pd_model_path = model_dir / f"{model_file_prefix}.json"
+    else:
+        if (model_dir / f"{model_file_prefix}.json").exists():
+            pd_model_path = model_dir / f"{model_file_prefix}.json"
+        elif (model_dir / f"{model_file_prefix}.pdmodel").exists():
+            pd_model_path = model_dir / f"{model_file_prefix}.pdmodel"
+    if pd_model_path and (model_dir / f"{model_file_prefix}.pdiparams").exists():
+        model_paths["paddle"] = (
+            pd_model_path,
+            model_dir / f"{model_file_prefix}.pdiparams",
+        )
+    if (model_dir / f"{model_file_prefix}.onnx").exists():
+        model_paths["onnx"] = model_dir / f"{model_file_prefix}.onnx"
+    if (model_dir / f"{model_file_prefix}.om").exists():
+        model_paths["om"] = model_dir / f"{model_file_prefix}.om"
+    return model_paths
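A minimal usage sketch of the new helper; the export directory and file prefix below are hypothetical. With `FLAGS_json_format_model` set, only a `.json` program file is accepted; otherwise `.json` is preferred over `.pdmodel`, and the `paddle` entry appears only when the matching `.pdiparams` file also exists:

```python
from paddlex.inference.utils.hpi import get_model_paths

# Hypothetical export directory containing `inference.json` + `inference.pdiparams`.
paths = get_model_paths("output/inference_model", "inference")
if "paddle" in paths:
    model_file, params_file = paths["paddle"]  # (.json or .pdmodel, .pdiparams)
if "onnx" in paths:
    onnx_file = paths["onnx"]
```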

+ 177 - 83
paddlex/inference/utils/pp_option.py

@@ -15,13 +15,13 @@
 import os
 from typing import Dict, List
 
+from ...utils import logging
 from ...utils.device import (
+    check_supported_device_type,
+    get_default_device,
     parse_device,
     set_env_for_device,
-    get_default_device,
-    check_supported_device,
 )
-from ...utils import logging
 from .new_ir_blacklist import NEWIR_BLOCKLIST
 from .trt_blacklist import TRT_BLOCKLIST
 
@@ -69,20 +69,26 @@ class PaddlePredictorOption(object):
 
     def _get_default_config(self):
         """get default config"""
-        device_type, device_id = parse_device(get_default_device())
+        device_type, device_ids = parse_device(get_default_device())
         return {
             "run_mode": "paddle",
-            "device": device_type,
-            "device_id": 0 if device_id is None else device_id[0],
-            "min_subgraph_size": 3,
-            "shape_info_filename": None,
-            "trt_calib_mode": False,
+            "device_type": device_type,
+            "device_id": None if device_ids is None else device_ids[0],
             "cpu_threads": 8,
-            "trt_use_static": False,
             "delete_pass": [],
             "enable_new_ir": True if self.model_name not in NEWIR_BLOCKLIST else False,
-            "batch_size": 1,  # only for trt
-            "trt_dynamic_shapes": {},  # only for trt
+            "trt_max_workspace_size": 1 << 30,  # only for trt
+            "trt_max_batch_size": 32,  # only for trt
+            "trt_min_subgraph_size": 3,  # only for trt
+            "trt_use_static": True,  # only for trt
+            "trt_use_calib_mode": False,  # only for trt
+            "trt_use_dynamic_shapes": True,  # only for trt
+            "trt_collect_shape_range_info": True,  # only for trt
+            "trt_discard_cached_shape_range_info": False,  # only for trt
+            "trt_dynamic_shapes": None,  # only for trt
+            "trt_dynamic_shape_input_data": None,  # only for trt
+            "trt_shape_range_info_path": None,  # only for trt
+            "trt_allow_rebuild_at_runtime": True,  # only for trt
         }
 
     def _update(self, k, v):
@@ -112,59 +118,119 @@ class PaddlePredictorOption(object):
 
     @property
     def device_type(self):
-        return self._cfg["device"]
+        return self._cfg["device_type"]
+
+    @device_type.setter
+    def device_type(self, device_type):
+        check_supported_device_type(device_type, self.model_name)
+        self._update("device_type", device_type)
 
     @property
     def device_id(self):
         return self._cfg["device_id"]
 
+    @device_id.setter
+    def device_id(self, device_id):
+        self._update("device_id", device_id)
+
     @property
-    def device(self):
-        device = self._cfg["device"]
-        check_supported_device(device, self.model_name)
-        return device
+    def cpu_threads(self):
+        return self._cfg["cpu_threads"]
 
-    @device.setter
-    def device(self, device: str):
-        """set device"""
-        if not device:
-            return
-        device_type, device_ids = parse_device(device)
-        if device_type not in self.SUPPORT_DEVICE:
-            support_run_mode_str = ", ".join(self.SUPPORT_DEVICE)
-            raise ValueError(
-                f"The device type must be one of {support_run_mode_str}, but received {repr(device_type)}."
-            )
-        self._update("device", device_type)
-        device_id = device_ids[0] if device_ids is not None else 0
-        self._update("device_id", device_id)
-        set_env_for_device(device)
-        if device_type not in ("cpu"):
-            if device_ids is None or len(device_ids) > 1:
-                logging.debug(f"The device ID has been set to {device_id}.")
-        # XXX(gaotingquan): set flag to accelerate inference in paddle 3.0b2
-        if device_type in ("gpu", "cpu"):
-            os.environ["FLAGS_enable_pir_api"] = "1"
+    @cpu_threads.setter
+    def cpu_threads(self, cpu_threads):
+        """set cpu threads"""
+        if not isinstance(cpu_threads, int) or cpu_threads < 1:
+            raise ValueError(f"cpu_threads must be a positive integer, but got {cpu_threads!r}.")
+        self._update("cpu_threads", cpu_threads)
 
     @property
-    def min_subgraph_size(self):
-        return self._cfg["min_subgraph_size"]
+    def delete_pass(self):
+        return self._cfg["delete_pass"]
 
-    @min_subgraph_size.setter
-    def min_subgraph_size(self, min_subgraph_size: int):
+    @delete_pass.setter
+    def delete_pass(self, delete_pass):
+        self._update("delete_pass", delete_pass)
+
+    @property
+    def enable_new_ir(self):
+        return self._cfg["enable_new_ir"]
+
+    @enable_new_ir.setter
+    def enable_new_ir(self, enable_new_ir: bool):
+        """set whether to enable the new IR"""
+        self._update("enable_new_ir", enable_new_ir)
+
+    @property
+    def trt_max_workspace_size(self):
+        return self._cfg["trt_max_workspace_size"]
+
+    @trt_max_workspace_size.setter
+    def trt_max_workspace_size(self, trt_max_workspace_size):
+        self._update("trt_max_workspace_size", trt_max_workspace_size)
+
+    @property
+    def trt_max_batch_size(self):
+        return self._cfg["trt_max_batch_size"]
+
+    @trt_max_batch_size.setter
+    def trt_max_batch_size(self, trt_max_batch_size):
+        self._update("trt_max_batch_size", trt_max_batch_size)
+
+    @property
+    def trt_min_subgraph_size(self):
+        return self._cfg["trt_min_subgraph_size"]
+
+    @trt_min_subgraph_size.setter
+    def trt_min_subgraph_size(self, trt_min_subgraph_size: int):
         """set min subgraph size"""
-        if not isinstance(min_subgraph_size, int):
+        if not isinstance(trt_min_subgraph_size, int):
             raise Exception()
-        self._update("min_subgraph_size", min_subgraph_size)
+        self._update("trt_min_subgraph_size", trt_min_subgraph_size)
 
     @property
-    def shape_info_filename(self):
-        return self._cfg["shape_info_filename"]
+    def trt_use_static(self):
+        return self._cfg["trt_use_static"]
 
-    @shape_info_filename.setter
-    def shape_info_filename(self, shape_info_filename: str):
-        """set shape info filename"""
-        self._update("shape_info_filename", shape_info_filename)
+    @trt_use_static.setter
+    def trt_use_static(self, trt_use_static):
+        """set trt use static"""
+        self._update("trt_use_static", trt_use_static)
+
+    @property
+    def trt_use_calib_mode(self):
+        return self._cfg["trt_use_calib_mode"]
+
+    @trt_use_calib_mode.setter
+    def trt_use_calib_mode(self, trt_use_calib_mode):
+        """set trt calib mode"""
+        self._update("trt_use_calib_mode", trt_use_calib_mode)
+
+    @property
+    def trt_use_dynamic_shapes(self):
+        return self._cfg["trt_use_dynamic_shapes"]
+
+    @trt_use_dynamic_shapes.setter
+    def trt_use_dynamic_shapes(self, trt_use_dynamic_shapes):
+        self._update("trt_use_dynamic_shapes", trt_use_dynamic_shapes)
+
+    @property
+    def trt_collect_shape_range_info(self):
+        return self._cfg["trt_collect_shape_range_info"]
+
+    @trt_collect_shape_range_info.setter
+    def trt_collect_shape_range_info(self, trt_collect_shape_range_info):
+        self._update("trt_collect_shape_range_info", trt_collect_shape_range_info)
+
+    @property
+    def trt_discard_cached_shape_range_info(self):
+        return self._cfg["trt_discard_cached_shape_range_info"]
+
+    @trt_discard_cached_shape_range_info.setter
+    def trt_discard_cached_shape_range_info(self, trt_discard_cached_shape_range_info):
+        self._update(
+            "trt_discard_cached_shape_range_info", trt_discard_cached_shape_range_info
+        )
 
     @property
     def trt_dynamic_shapes(self):
@@ -178,58 +244,86 @@ class PaddlePredictorOption(object):
         self._update("trt_dynamic_shapes", trt_dynamic_shapes)
 
     @property
-    def trt_calib_mode(self):
-        return self._cfg["trt_calib_mode"]
+    def trt_dynamic_shape_input_data(self):
+        return self._cfg["trt_dynamic_shape_input_data"]
 
-    @trt_calib_mode.setter
-    def trt_calib_mode(self, trt_calib_mode):
-        """set trt calib mode"""
-        self._update("trt_calib_mode", trt_calib_mode)
+    @trt_dynamic_shape_input_data.setter
+    def trt_dynamic_shape_input_data(
+        self, trt_dynamic_shape_input_data: Dict[str, List[float]]
+    ):
+        self._update("trt_dynamic_shape_input_data", trt_dynamic_shape_input_data)
 
     @property
-    def cpu_threads(self):
-        return self._cfg["cpu_threads"]
+    def trt_shape_range_info_path(self):
+        return self._cfg["trt_shape_range_info_path"]
 
-    @cpu_threads.setter
-    def cpu_threads(self, cpu_threads):
-        """set cpu threads"""
-        if not isinstance(cpu_threads, int) or cpu_threads < 1:
-            raise Exception()
-        self._update("cpu_threads", cpu_threads)
+    @trt_shape_range_info_path.setter
+    def trt_shape_range_info_path(self, trt_shape_range_info_path: str):
+        """set TRT shape range info path"""
+        self._update("trt_shape_range_info_path", trt_shape_range_info_path)
 
     @property
-    def trt_use_static(self):
-        return self._cfg["trt_use_static"]
+    def trt_allow_rebuild_at_runtime(self):
+        return self._cfg["trt_allow_rebuild_at_runtime"]
 
-    @trt_use_static.setter
-    def trt_use_static(self, trt_use_static):
-        """set trt use static"""
-        self._update("trt_use_static", trt_use_static)
+    @trt_allow_rebuild_at_runtime.setter
+    def trt_allow_rebuild_at_runtime(self, trt_allow_rebuild_at_runtime):
+        self._update("trt_allow_rebuild_at_runtime", trt_allow_rebuild_at_runtime)
 
+    # For backward compatibility
+    # TODO: Issue deprecation warnings
     @property
-    def delete_pass(self):
-        return self._cfg["delete_pass"]
+    def min_subgraph_size(self):
+        return self.trt_min_subgraph_size
 
-    @delete_pass.setter
-    def delete_pass(self, delete_pass):
-        self._update("delete_pass", delete_pass)
+    @min_subgraph_size.setter
+    def min_subgraph_size(self, min_subgraph_size):
+        self.trt_min_subgraph_size = min_subgraph_size
 
     @property
-    def enable_new_ir(self):
-        return self._cfg["enable_new_ir"]
+    def shape_info_filename(self):
+        return self.trt_shape_range_info_path
 
-    @enable_new_ir.setter
-    def enable_new_ir(self, enable_new_ir: bool):
-        """set run mode"""
-        self._update("enable_new_ir", enable_new_ir)
+    @shape_info_filename.setter
+    def shape_info_filename(self, shape_info_filename):
+        self.trt_shape_range_info_path = shape_info_filename
+
+    @property
+    def trt_calib_mode(self):
+        return self.trt_use_calib_mode
+
+    @trt_calib_mode.setter
+    def trt_calib_mode(self, trt_calib_mode):
+        self.trt_use_calib_mode = trt_calib_mode
 
     @property
     def batch_size(self):
-        return self._cfg["batch_size"]
+        return self.trt_max_batch_size
 
     @batch_size.setter
     def batch_size(self, batch_size):
-        self._update("batch_size", batch_size)
+        self.trt_max_batch_size = batch_size
+
+    def set_device(self, device: str):
+        """set device"""
+        if not device:
+            return
+        device_type, device_ids = parse_device(device)
+        if device_type not in self.SUPPORT_DEVICE:
+            supported_device_types_str = ", ".join(self.SUPPORT_DEVICE)
+            raise ValueError(
+                f"The device type must be one of {supported_device_types_str}, but received {repr(device_type)}."
+            )
+        self.device_type = device_type
+        device_id = device_ids[0] if device_ids is not None else None
+        self.device_id = device_id
+        set_env_for_device(device)
+        if device_type not in ("cpu",):
+            if device_ids is None or len(device_ids) > 1:
+                logging.debug(f"The device ID has been set to {device_id}.")
+        # XXX(gaotingquan): set flag to accelerate inference in paddle 3.0b2
+        if device_type in ("gpu", "cpu"):
+            os.environ["FLAGS_enable_pir_api"] = "1"
 
     def get_support_run_mode(self):
         """get supported run mode"""

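A minimal sketch of the reworked option surface, assuming a hypothetical model name: `set_device` replaces the removed `device` setter, the `trt_*` keys replace the old flat names, and the deprecated aliases forward to the new keys:

```python
from paddlex.inference.utils.pp_option import PaddlePredictorOption

opt = PaddlePredictorOption("PP-LCNet_x1_0")  # hypothetical model name
opt.set_device("gpu:0")                # replaces the removed `device` setter
opt.trt_max_workspace_size = 1 << 31   # renamed TRT options
opt.trt_min_subgraph_size = 5

# Deprecated aliases still work and forward to the renamed keys:
opt.min_subgraph_size = 3
assert opt.trt_min_subgraph_size == 3
```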
+ 7 - 3
paddlex/model.py

@@ -85,10 +85,14 @@ class _ModelBasedConfig(_BaseModel):
 
         device = self._config.Global.get("device")
         kernel_option = predict_kwargs.pop("kernel_option", {})
-        kernel_option.update({"device": device})
-
         pp_option = PaddlePredictorOption(self._model_name, **kernel_option)
-        predictor = create_predictor(self._model_name, model_dir, pp_option=pp_option)
+
+        predictor = create_predictor(
+            self._model_name,
+            model_dir,
+            device=device,
+            pp_option=pp_option,
+        )
         assert "input" in predict_kwargs
         return predict_kwargs, predictor
 

+ 8 - 4
paddlex/utils/device.py

@@ -30,7 +30,7 @@ from .custom_device_whitelist import (
 SUPPORTED_DEVICE_TYPE = ["cpu", "gpu", "xpu", "npu", "mlu", "gcu", "dcu"]
 
 
-def _constr_device(device_type, device_ids):
+def constr_device(device_type, device_ids):
     if device_ids:
         device_ids = ",".join(map(str, device_ids))
         return f"{device_type}:{device_ids}"
@@ -50,7 +50,7 @@ def get_default_device():
     if not avail_gpus:
         return "cpu"
     else:
-        return _constr_device("gpu", [avail_gpus[0]])
+        return constr_device("gpu", [avail_gpus[0]])
 
 
 def parse_device(device):
@@ -80,9 +80,9 @@ def update_device_num(device, num):
     device_type, device_ids = parse_device(device)
     if device_ids:
         assert len(device_ids) >= num
-        return _constr_device(device_type, device_ids[:num])
+        return constr_device(device_type, device_ids[:num])
     else:
-        return _constr_device(device_type, device_ids)
+        return constr_device(device_type, device_ids)
 
 
 def set_env_for_device(device):
@@ -129,6 +129,10 @@ def check_supported_device(device, model_name):
         )
         return
     device_type, device_ids = parse_device(device)
+    return check_supported_device_type(device_type, model_name)
+
+
+def check_supported_device_type(device_type, model_name):
     if device_type == "dcu":
         assert (
             model_name in DCU_WHITELIST

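A minimal round-trip sketch of the now-public helper (renamed from `_constr_device`); the return shape of `parse_device` is inferred from its uses elsewhere in this diff:

```python
from paddlex.utils.device import constr_device, parse_device

device = constr_device("gpu", [0, 1])  # -> "gpu:0,1"
device_type, device_ids = parse_device(device)
# device_type == "gpu"; device_ids holds both IDs (integer IDs assumed here)
```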
+ 7 - 5
paddlex/utils/flags.py

@@ -21,9 +21,9 @@ __all__ = [
     "CHECK_OPTS",
     "EAGER_INITIALIZATION",
     "INFER_BENCHMARK",
-    "INFER_BENCHMARK_ITER",
+    "INFER_BENCHMARK_ITERS",
     "INFER_BENCHMARK_WARMUP",
-    "INFER_BENCHMARK_OUTPUT",
+    "INFER_BENCHMARK_OUTPUT_DIR",
     "FLAGS_json_format_model",
     "USE_PIR_TRT",
     "DISABLE_DEV_MODEL_WL",
@@ -55,7 +55,9 @@ INFER_BENCHMARK = get_flag_from_env_var("PADDLE_PDX_INFER_BENCHMARK", None)
 INFER_BENCHMARK_WARMUP = get_flag_from_env_var(
     "PADDLE_PDX_INFER_BENCHMARK_WARMUP", 0, int
 )
-INFER_BENCHMARK_OUTPUT = get_flag_from_env_var(
-    "PADDLE_PDX_INFER_BENCHMARK_OUTPUT", None
+INFER_BENCHMARK_OUTPUT_DIR = get_flag_from_env_var(
+    "PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR", None
+)
+INFER_BENCHMARK_ITERS = get_flag_from_env_var(
+    "PADDLE_PDX_INFER_BENCHMARK_ITERS", 0, int
 )
-INFER_BENCHMARK_ITER = get_flag_from_env_var("PADDLE_PDX_INFER_BENCHMARK_ITER", 0, int)
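A minimal sketch of the renamed flags; since they are read at import time, the environment variables must be set before importing `paddlex`:

```python
import os

# Renamed environment variables (old names shown in the removals above):
os.environ["PADDLE_PDX_INFER_BENCHMARK"] = "True"
os.environ["PADDLE_PDX_INFER_BENCHMARK_ITERS"] = "20"            # was ..._ITER
os.environ["PADDLE_PDX_INFER_BENCHMARK_OUTPUT_DIR"] = "./bench"  # was ..._OUTPUT

from paddlex.utils.flags import INFER_BENCHMARK_ITERS

assert INFER_BENCHMARK_ITERS == 20
```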