zhouchangda 8 bulan lalu
induk
melakukan
66bdd0923e

+ 2 - 2
paddlex/configs/pipelines/PP-StructureV3.yaml

@@ -48,10 +48,10 @@ SubPipelines:
         module_name: text_detection
         model_name: PP-OCRv4_server_det
         model_dir: null
-        limit_side_len: 960
+        limit_side_len: 1200
         limit_type: max
         thresh: 0.3
-        box_thresh: 0.6
+        box_thresh: 0.4
         unclip_ratio: 2.0
       TextLineOrientation:
         module_name: textline_orientation

+ 21 - 3
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -256,12 +256,20 @@ class LayoutParsingPipelineV2(BasePipeline):
         matched_ocr_dict = {}
         image = np.array(image)
         object_boxes = []
+        footnote_list = []
+        max_bottom_text_coordinate = 0
 
         for object_box_idx, box_info in enumerate(layout_det_res["boxes"]):
             box = box_info["coordinate"]
             label = box_info["label"].lower()
             object_boxes.append(box)
 
+            # relabel a footnote as text when its bottom edge is above the bottom of the lowest text box
+            if label == "footnote":
+                footnote_list.append(object_box_idx)
+            if label == "text" and box[3] > max_bottom_text_coordinate:
+                max_bottom_text_coordinate = box[3]
+
             if label not in ["formula", "table", "seal"]:
                 _, matched_idxs = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
@@ -272,6 +280,13 @@ class LayoutParsingPipelineV2(BasePipeline):
                     else:
                         matched_ocr_dict[matched_idx].append(object_box_idx)
 
+        for footnote_idx in footnote_list:
+            if (
+                layout_det_res["boxes"][footnote_idx]["coordinate"][3]
+                < max_bottom_text_coordinate
+            ):
+                layout_det_res["boxes"][footnote_idx]["label"] = "text"
+
         already_processed = set()
         for matched_idx, layout_box_ids in matched_ocr_dict.items():
             if len(layout_box_ids) <= 1:
@@ -578,9 +593,12 @@ class LayoutParsingPipelineV2(BasePipeline):
                     table_contents["rec_texts"].append(
                         f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
                     )
-                    table_contents["rec_boxes"] = np.vstack(
-                        (table_contents["rec_boxes"], img["coordinate"])
-                    )
+                    if table_contents["rec_boxes"].size == 0:
+                        table_contents["rec_boxes"] = np.array([img["coordinate"]])
+                    else:
+                        table_contents["rec_boxes"] = np.vstack(
+                            (table_contents["rec_boxes"], img["coordinate"])
+                        )
                     table_contents["rec_polys"].append(poly_points)
                     table_contents["rec_scores"].append(img["score"])