Эх сурвалжийг харах

support 'max_side_limit' and change 'unclip_ratio' for OCR det model (#3979)

学卿 6 сар өмнө
parent
commit
26957be66b
25 өөрчлөгдсөн 95 нэмэгдсэн , 28 устгасан
  1. 2 1
      docs/API_change_log/v3.0.0rc.en.md
  2. 2 1
      docs/API_change_log/v3.0.0rc.md
  3. 2 1
      docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.md
  4. 2 1
      docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.en.md
  5. 1 1
      docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.md
  6. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
  7. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
  8. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md
  9. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md
  10. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md
  11. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition_v2.en.md
  12. 2 1
      docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition_v2.md
  13. 1 0
      docs/practical_tutorials/document_scene_information_extraction(seal_recognition)_tutorial.md
  14. 4 2
      docs/practical_tutorials/high_performance_npu_tutorial.md
  15. 2 1
      paddlex/configs/pipelines/OCR.yaml
  16. 3 1
      paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml
  17. 3 1
      paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml
  18. 4 1
      paddlex/configs/pipelines/PP-StructureV3.yaml
  19. 3 1
      paddlex/configs/pipelines/layout_parsing.yaml
  20. 1 0
      paddlex/configs/pipelines/seal_recognition.yaml
  21. 2 1
      paddlex/configs/pipelines/table_recognition.yaml
  22. 2 1
      paddlex/configs/pipelines/table_recognition_v2.yaml
  23. 6 0
      paddlex/inference/models/text_detection/predictor.py
  24. 30 7
      paddlex/inference/models/text_detection/processors.py
  25. 11 0
      paddlex/inference/pipelines/ocr/pipeline.py

+ 2 - 1
docs/API_change_log/v3.0.0rc.en.md

@@ -79,9 +79,10 @@
         model_dir: null
         limit_side_len: 960 # Settings related to the sub-module TextDetection
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
     ```
 
 ## 3. Pipeline Features Changes

+ 2 - 1
docs/API_change_log/v3.0.0rc.md

@@ -79,9 +79,10 @@
         model_dir: null
         limit_side_len: 960 # 子模块 TextDetection 的相关设置
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
     ```
 
 ## 三、产线功能变更

+ 2 - 1
docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v3.md

@@ -1842,9 +1842,10 @@ SubModules:
     model_dir: null # 替换为微调后的文本检测模型权重路径
     limit_side_len: 960
     limit_type: max
+    max_side_limit: 4000
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
 
     TextRecognition:
     module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.en.md

@@ -2055,9 +2055,10 @@ SubModules:
     model_dir: null # Replace with the path to the fine-tuned text detection model weights
     limit_side_len: 960
     limit_type: max
+    max_side_limit: 4000
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
 
     TextRecognition:
     module_name: text_recognition

+ 1 - 1
docs/pipeline_usage/tutorials/information_extraction_pipelines/document_scene_information_extraction_v4.md

@@ -2264,7 +2264,7 @@ SubModules:
     limit_type: max
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
 
     TextRecognition:
     module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md

@@ -1894,9 +1894,10 @@ SubPipelines:
         model_dir: null
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
 
       TextRecognition:
         module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md

@@ -1838,9 +1838,10 @@ SubPipelines:
         model_dir: null # 替换为微调后的文本检测模型权重路径
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
 
       TextRecognition:
         module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md

@@ -1644,9 +1644,10 @@ SubPipelines:
         model_dir: null # 替换为微调后的文本检测模型权重路径
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
 
       TextRecognition:
         module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.en.md

@@ -1481,9 +1481,10 @@ SubPipelines:
         model_dir: null # Replace with fine-tuned model weight paths
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.4
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
       TextRecognition:
         module_name: text_recognition
         model_name: PP-OCRv4_server_rec

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition.md

@@ -1424,9 +1424,10 @@ SubPipelines:
         model_dir: null # 替换为微调后的文本检测模型权重路径
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.4
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
       TextRecognition:
         module_name: text_recognition
         model_name: PP-OCRv4_server_rec

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition_v2.en.md

@@ -1611,9 +1611,10 @@ SubPipelines:
         model_dir: null
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.4
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
 
       TextRecognition:
         module_name: text_recognition

+ 2 - 1
docs/pipeline_usage/tutorials/ocr_pipelines/table_recognition_v2.md

@@ -1616,9 +1616,10 @@ SubPipelines:
         model_dir: null # 替换为微调后的文本检测模型权重路径
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.4
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
 
       TextRecognition:
         module_name: text_recognition

+ 1 - 0
docs/practical_tutorials/document_scene_information_extraction(seal_recognition)_tutorial.md

@@ -409,6 +409,7 @@ SubPipelines:
                 model_dir: output/best_accuracy/inference # 修改为微调后的模型路径
                 limit_side_len: 736
                 limit_type: min
+                max_side_limit: 4000
                 thresh: 0.2
                 box_thresh: 0.6
                 unclip_ratio: 0.5

+ 4 - 2
docs/practical_tutorials/high_performance_npu_tutorial.md

@@ -262,9 +262,10 @@ SubModules:
     model_dir: PP-OCRv4_mobile_det_infer_om
     limit_side_len: 960
     limit_type: max
+    max_side_limit: 4000
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
     input_shape: [3, 640, 480]
   TextLineOrientation:
     module_name: textline_orientation
@@ -336,9 +337,10 @@ SubModules:
     model_dir: PP-OCRv4_mobile_det_infer_onnx
     limit_side_len: 960
     limit_type: max
+    max_side_limit: 4000
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
     hpi_config:
       auto_config: False
       backend: onnxruntime

+ 2 - 1
paddlex/configs/pipelines/OCR.yaml

@@ -28,9 +28,10 @@ SubModules:
     model_dir: null
     limit_side_len: 960
     limit_type: max
+    max_side_limit: 4000
     thresh: 0.3
     box_thresh: 0.6
-    unclip_ratio: 2.0
+    unclip_ratio: 1.5
   TextLineOrientation:
     module_name: textline_orientation
     model_name: PP-LCNet_x0_25_textline_ori 

+ 3 - 1
paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml

@@ -99,9 +99,10 @@ SubPipelines:
             model_dir: null
             limit_side_len: 960
             limit_type: max
+            max_side_limit: 4000
             thresh: 0.3
             box_thresh: 0.6
-            unclip_ratio: 2.0
+            unclip_ratio: 1.5
             
           TextRecognition:
             module_name: text_recognition
@@ -138,6 +139,7 @@ SubPipelines:
                 model_dir: null
                 limit_side_len: 736
                 limit_type: min
+                max_side_limit: 4000
                 thresh: 0.2
                 box_thresh: 0.6
                 unclip_ratio: 0.5

+ 3 - 1
paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml

@@ -134,9 +134,10 @@ SubPipelines:
             model_dir: null
             limit_side_len: 960
             limit_type: max
+            max_side_limit: 4000
             thresh: 0.3
             box_thresh: 0.6
-            unclip_ratio: 2.0
+            unclip_ratio: 1.5
             
           TextRecognition:
             module_name: text_recognition
@@ -173,6 +174,7 @@ SubPipelines:
                 model_dir: null
                 limit_side_len: 736
                 limit_type: min
+                max_side_limit: 4000
                 thresh: 0.2
                 box_thresh: 0.6
                 unclip_ratio: 0.5

+ 4 - 1
paddlex/configs/pipelines/PP-StructureV3.yaml

@@ -125,6 +125,7 @@ SubPipelines:
         model_dir: null
         limit_side_len: 736
         limit_type: min
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
         unclip_ratio: 1.5
@@ -184,9 +185,10 @@ SubPipelines:
             model_dir: null
             limit_side_len: 736
             limit_type: min
+            max_side_limit: 4000
             thresh: 0.3
             box_thresh: 0.4
-            unclip_ratio: 2.0
+            unclip_ratio: 1.5
           TextLineOrientation:
             module_name: textline_orientation
             model_name: PP-LCNet_x0_25_textline_ori
@@ -218,6 +220,7 @@ SubPipelines:
             model_dir: null
             limit_side_len: 736
             limit_type: min
+            max_side_limit: 4000
             thresh: 0.2
             box_thresh: 0.6
             unclip_ratio: 0.5

+ 3 - 1
paddlex/configs/pipelines/layout_parsing.yaml

@@ -39,9 +39,10 @@ SubPipelines:
         model_dir: null
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
         
       TextRecognition:
         module_name: text_recognition
@@ -78,6 +79,7 @@ SubPipelines:
             model_dir: null
             limit_side_len: 736
             limit_type: min
+            max_side_limit: 4000
             thresh: 0.2
             box_thresh: 0.6
             unclip_ratio: 0.5

+ 1 - 0
paddlex/configs/pipelines/seal_recognition.yaml

@@ -40,6 +40,7 @@ SubPipelines:
         model_dir: null
         limit_side_len: 736
         limit_type: min
+        max_side_limit: 4000
         thresh: 0.2
         box_thresh: 0.6
         unclip_ratio: 0.5

+ 2 - 1
paddlex/configs/pipelines/table_recognition.yaml

@@ -44,9 +44,10 @@ SubPipelines:
         model_dir: null
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.6
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
         
       TextRecognition:
         module_name: text_recognition

+ 2 - 1
paddlex/configs/pipelines/table_recognition_v2.yaml

@@ -69,9 +69,10 @@ SubPipelines:
         model_dir: null
         limit_side_len: 960
         limit_type: max
+        max_side_limit: 4000
         thresh: 0.3
         box_thresh: 0.4
-        unclip_ratio: 2.0
+        unclip_ratio: 1.5
         
       TextRecognition:
         module_name: text_recognition

+ 6 - 0
paddlex/inference/models/text_detection/predictor.py

@@ -41,6 +41,7 @@ class TextDetPredictor(BasePredictor):
         box_thresh: Union[float, None] = None,
         unclip_ratio: Union[float, None] = None,
         input_shape=None,
+        max_side_limit: int = 4000,
         *args,
         **kwargs
     ):
@@ -52,6 +53,7 @@ class TextDetPredictor(BasePredictor):
         self.box_thresh = box_thresh
         self.unclip_ratio = unclip_ratio
         self.input_shape = input_shape
+        self.max_side_limit = max_side_limit
         self.pre_tfs, self.infer, self.post_op = self._build()
 
     def _build_batch_sampler(self):
@@ -85,6 +87,7 @@ class TextDetPredictor(BasePredictor):
         thresh: Union[float, None] = None,
         box_thresh: Union[float, None] = None,
         unclip_ratio: Union[float, None] = None,
+        max_side_limit: Union[int, None] = None,
     ):
 
         batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data.instances)
@@ -92,6 +95,9 @@ class TextDetPredictor(BasePredictor):
             imgs=batch_raw_imgs,
             limit_side_len=limit_side_len or self.limit_side_len,
             limit_type=limit_type or self.limit_type,
+            max_side_limit=(
+                max_side_limit if max_side_limit is not None else self.max_side_limit
+            ),
         )
         batch_imgs = self.pre_tfs["Normalize"](imgs=batch_imgs)
         batch_imgs = self.pre_tfs["ToCHW"](imgs=batch_imgs)

+ 30 - 7
paddlex/inference/models/text_detection/processors.py

@@ -32,8 +32,7 @@ if is_dep_available("pyclipper"):
 class DetResizeForTest:
     """DetResizeForTest"""
 
-    def __init__(self, input_shape=None, **kwargs):
-        super().__init__()
+    def __init__(self, input_shape=None, max_side_limit=4000, **kwargs):
         self.resize_type = 0
         self.keep_ratio = False
         if input_shape is not None:
@@ -54,22 +53,34 @@ class DetResizeForTest:
             self.limit_side_len = 736
             self.limit_type = "min"
 
+        self.max_side_limit = max_side_limit
+
     def __call__(
         self,
         imgs,
         limit_side_len: Union[int, None] = None,
         limit_type: Union[str, None] = None,
+        max_side_limit: Union[int, None] = None,
     ):
         """apply"""
+        max_side_limit = (
+            max_side_limit if max_side_limit is not None else self.max_side_limit
+        )
         resize_imgs, img_shapes = [], []
         for ori_img in imgs:
-            img, shape = self.resize(ori_img, limit_side_len, limit_type)
+            img, shape = self.resize(
+                ori_img, limit_side_len, limit_type, max_side_limit
+            )
             resize_imgs.append(img)
             img_shapes.append(shape)
         return resize_imgs, img_shapes
 
     def resize(
-        self, img, limit_side_len: Union[int, None], limit_type: Union[str, None]
+        self,
+        img,
+        limit_side_len: Union[int, None],
+        limit_type: Union[str, None],
+        max_side_limit: Union[int, None] = None,
     ):
         src_h, src_w, _ = img.shape
         if sum([src_h, src_w]) < 64:
@@ -78,7 +89,7 @@ class DetResizeForTest:
         if self.resize_type == 0:
             # img, shape = self.resize_image_type0(img)
             img, [ratio_h, ratio_w] = self.resize_image_type0(
-                img, limit_side_len, limit_type
+                img, limit_side_len, limit_type, max_side_limit
             )
         elif self.resize_type == 2:
             img, [ratio_h, ratio_w] = self.resize_image_type2(img)
@@ -113,7 +124,11 @@ class DetResizeForTest:
         return img, [ratio_h, ratio_w]
 
     def resize_image_type0(
-        self, img, limit_side_len: Union[int, None], limit_type: Union[str, None]
+        self,
+        img,
+        limit_side_len: Union[int, None],
+        limit_type: Union[str, None],
+        max_side_limit: Union[int, None] = None,
     ):
         """
         resize image to a size multiple of 32 which is required by the network
@@ -150,6 +165,14 @@ class DetResizeForTest:
         resize_h = int(h * ratio)
         resize_w = int(w * ratio)
 
+        if max(resize_h, resize_w) > max_side_limit:
+            logging.warning(
+                f"Resized image size ({resize_h}x{resize_w}) exceeds max_side_limit of {max_side_limit}. "
+                f"Resizing to fit within limit."
+            )
+            ratio = float(max_side_limit) / max(resize_h, resize_w)
+            resize_h, resize_w = int(resize_h * ratio), int(resize_w * ratio)
+
         resize_h = max(int(round(resize_h / 32) * 32), 32)
         resize_w = max(int(round(resize_w / 32) * 32), 32)
 
@@ -264,7 +287,7 @@ class DBPostProcess:
         use_dilation=False,
         score_mode="fast",
         box_type="quad",
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         self.thresh = thresh

+ 11 - 0
paddlex/inference/pipelines/ocr/pipeline.py

@@ -91,6 +91,7 @@ class _OCRPipeline(BasePipeline):
         if self.text_type == "general":
             self.text_det_limit_side_len = text_det_config.get("limit_side_len", 960)
             self.text_det_limit_type = text_det_config.get("limit_type", "max")
+            self.text_det_max_side_limit = text_det_config.get("max_side_limit", 4000)
             self.text_det_thresh = text_det_config.get("thresh", 0.3)
             self.text_det_box_thresh = text_det_config.get("box_thresh", 0.6)
             self.input_shape = text_det_config.get("input_shape", None)
@@ -100,6 +101,7 @@ class _OCRPipeline(BasePipeline):
         elif self.text_type == "seal":
             self.text_det_limit_side_len = text_det_config.get("limit_side_len", 736)
             self.text_det_limit_type = text_det_config.get("limit_type", "min")
+            self.text_det_max_side_limit = text_det_config.get("max_side_limit", 4000)
             self.text_det_thresh = text_det_config.get("thresh", 0.2)
             self.text_det_box_thresh = text_det_config.get("box_thresh", 0.6)
             self.text_det_unclip_ratio = text_det_config.get("unclip_ratio", 0.5)
@@ -113,6 +115,7 @@ class _OCRPipeline(BasePipeline):
             text_det_config,
             limit_side_len=self.text_det_limit_side_len,
             limit_type=self.text_det_limit_type,
+            max_side_limit=self.text_det_max_side_limit,
             thresh=self.text_det_thresh,
             box_thresh=self.text_det_box_thresh,
             unclip_ratio=self.text_det_unclip_ratio,
@@ -232,6 +235,7 @@ class _OCRPipeline(BasePipeline):
         self,
         text_det_limit_side_len: Optional[int] = None,
         text_det_limit_type: Optional[str] = None,
+        text_det_max_side_limit: Optional[int] = None,
         text_det_thresh: Optional[float] = None,
         text_det_box_thresh: Optional[float] = None,
         text_det_unclip_ratio: Optional[float] = None,
@@ -244,6 +248,7 @@ class _OCRPipeline(BasePipeline):
         Args:
             text_det_limit_side_len (Optional[int]): The maximum side length of the text box.
             text_det_limit_type (Optional[str]): The type of limit to apply to the text box.
+            text_det_max_side_limit (Optional[int]): Upper bound on the longer side of the resized detection input; larger results are scaled down to fit.
             text_det_thresh (Optional[float]): The threshold for text detection.
             text_det_box_thresh (Optional[float]): The threshold for the bounding box.
             text_det_unclip_ratio (Optional[float]): The ratio for unclipping the text box.
@@ -255,6 +260,8 @@ class _OCRPipeline(BasePipeline):
             text_det_limit_side_len = self.text_det_limit_side_len
         if text_det_limit_type is None:
             text_det_limit_type = self.text_det_limit_type
+        if text_det_max_side_limit is None:
+            text_det_max_side_limit = self.text_det_max_side_limit
         if text_det_thresh is None:
             text_det_thresh = self.text_det_thresh
         if text_det_box_thresh is None:
@@ -265,6 +272,7 @@ class _OCRPipeline(BasePipeline):
             limit_side_len=text_det_limit_side_len,
             limit_type=text_det_limit_type,
             thresh=text_det_thresh,
+            max_side_limit=text_det_max_side_limit,
             box_thresh=text_det_box_thresh,
             unclip_ratio=text_det_unclip_ratio,
         )
@@ -277,6 +285,7 @@ class _OCRPipeline(BasePipeline):
         use_textline_orientation: Optional[bool] = None,
         text_det_limit_side_len: Optional[int] = None,
         text_det_limit_type: Optional[str] = None,
+        text_det_max_side_limit: Optional[int] = None,
         text_det_thresh: Optional[float] = None,
         text_det_box_thresh: Optional[float] = None,
         text_det_unclip_ratio: Optional[float] = None,
@@ -292,6 +301,7 @@ class _OCRPipeline(BasePipeline):
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
             text_det_limit_side_len (Optional[int]): Maximum side length for text detection.
             text_det_limit_type (Optional[str]): Type of limit to apply for text detection.
+            text_det_max_side_limit (Optional[int]): Upper bound on the longer side of the resized image for text detection; larger results are scaled down to fit.
             text_det_thresh (Optional[float]): Threshold for text detection.
             text_det_box_thresh (Optional[float]): Threshold for text detection boxes.
             text_det_unclip_ratio (Optional[float]): Ratio for unclipping text detection boxes.
@@ -310,6 +320,7 @@ class _OCRPipeline(BasePipeline):
         text_det_params = self.get_text_det_params(
             text_det_limit_side_len,
             text_det_limit_type,
+            text_det_max_side_limit,
             text_det_thresh,
             text_det_box_thresh,
             text_det_unclip_ratio,