소스 검색

Layout parsing v2 (#2902)

* Fix ocr pipeline & Support saving images in the dict

* update

* support layout_parsing_v2 pipeline

* update layout_parsing_v2
cuicheng01 10 달 전
부모
커밋
4474a183fa

+ 35 - 0
api_examples/pipelines/test_layout_parsing_v2.py

@@ -0,0 +1,35 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlex import create_pipeline
+
+# Example: run the layout_parsing_v2 pipeline on a single sample image and
+# export every result it yields in several formats.
+pipeline = create_pipeline(pipeline="layout_parsing_v2")
+
+# Per-call switches for the optional stages. The pipeline config
+# (paddlex/configs/pipelines/layout_parsing_v2.yaml) names the general-OCR
+# switch `use_general_ocr`, so the same name is used here.
+output = pipeline.predict(
+    "./test_samples/demo_paper.png",
+    use_doc_orientation_classify=False,
+    use_doc_unwarping=False,
+    use_general_ocr=True,
+    use_seal_recognition=True,
+    use_table_recognition=True,
+)
+
+# Print each result, then write its exports under ./output.
+for res in output:
+    res.print()
+    res.save_to_img("./output")
+    res.save_to_json("./output")
+    res.save_to_xlsx("./output")
+    res.save_to_html("./output")
+    res.save_to_markdown("./output")
+    res.save_to_pdf_order("./output")

+ 132 - 0
paddlex/configs/pipelines/layout_parsing_v2.yaml

@@ -0,0 +1,132 @@
+
+pipeline_name: layout_parsing_v2
+
+use_doc_preprocessor: True
+use_general_ocr: True
+use_seal_recognition: True
+use_table_recognition: True
+use_formula_recognition: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayout-L
+    model_dir: null
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: False
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv4_server_det
+        model_dir: null
+        limit_side_len: 960
+        limit_type: max
+        thresh: 0.3
+        box_thresh: 0.6
+        unclip_ratio: 2.0
+        
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv4_server_rec
+        model_dir: null
+        batch_size: 1
+        score_thresh: 0.0
+
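+  # The table_recognition_v2 configuration below is commented out (disabled);
+  # the active TableRecognition sub-pipeline is defined right after it.
+  # TableRecognition: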
+  #   pipeline_name: table_recognition_v2
+  #   use_layout_detection: False
+  #   use_doc_preprocessor: False
+  #   use_ocr_model: True
+  #   SubModules:  
+  #     TableClassification:
+  #       module_name: table_classification
+  #       model_name: PP-LCNet_x1_0_table_cls
+  #       model_dir: null
+
+  #     WiredTableStructureRecognition:
+  #       module_name: table_structure_recognition
+  #       model_name: SLANeXt_wired
+  #       model_dir: null
+      
+  #     WirelessTableStructureRecognition:
+  #       module_name: table_structure_recognition
+  #       model_name: SLANeXt_wireless
+  #       model_dir: null
+      
+  #     WiredTableCellsDetection:
+  #       module_name: table_cells_detection
+  #       model_name: RT-DETR-L_wired_table_cell_det
+  #       model_dir: null
+      
+  #     WirelessTableCellsDetection:
+  #       module_name: table_cells_detection
+  #       model_name: RT-DETR-L_wireless_table_cell_det
+  #       model_dir: null
+
+  TableRecognition:
+    pipeline_name: table_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    use_ocr_model: False
+    SubModules:
+      TableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANet_plus
+        model_dir: null
+
+  SealRecognition:
+    pipeline_name: seal_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubPipelines:
+      SealOCR:
+        pipeline_name: OCR
+        text_type: seal
+        use_doc_preprocessor: False
+        use_textline_orientation: False
+        SubModules:
+          TextDetection:
+            module_name: seal_text_detection
+            model_name: PP-OCRv4_server_seal_det
+            model_dir: null
+            limit_side_len: 736
+            limit_type: min
+            thresh: 0.2
+            box_thresh: 0.6
+            unclip_ratio: 0.5
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv4_server_rec
+            model_dir: null
+            batch_size: 1
+            score_thresh: 0
+    
+  FormulaRecognition:
+    pipeline_name: formula_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubModules:
+      FormulaRecognition:
+        module_name: formula_recognition
+        model_name: PP-FormulaNet-L
+        model_dir: null
+        batch_size: 5

+ 1 - 0
paddlex/inference/pipelines_new/layout_parsing/__init__.py

@@ -13,3 +13,4 @@
 # limitations under the License.
 
 from .pipeline import LayoutParsingPipeline
+from .pipeline_v2 import LayoutParsingPipelineV2

+ 1 - 1
paddlex/inference/pipelines_new/layout_parsing/pipeline_v2.py

@@ -40,7 +40,7 @@ from .utils import get_sub_regions_ocr_res
 class LayoutParsingPipelineV2(BasePipeline):
     """Layout Parsing Pipeline V2"""
 
-    entities = ["layout_parsing"]
+    entities = ["layout_parsing_v2"]
 
     def __init__(
         self,

+ 1 - 1
paddlex/inference/pipelines_new/layout_parsing/result_v2.py

@@ -103,7 +103,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 formula_region_id = formula_res["formula_region_id"]
                 sub_formula_res_dict = formula_res.img
                 key = f"formula_res_region{formula_region_id}"
-                res_img_dict[key] = sub_formula_res_dict
+                res_img_dict[key] = sub_formula_res_dict["res"]
 
         return res_img_dict