Преглед изворни кода

bugfix && remove redundant page index

zhouchangda пре 6 месеци
родитељ
комит
fdec0ca3c4

+ 19 - 8
paddlex/configs/pipelines/PP-StructureV3.yaml

@@ -8,12 +8,12 @@ use_seal_recognition: True
 use_table_recognition: True
 use_formula_recognition: True
 use_chart_recognition: False
-use_region_detection: False
+use_region_detection: True
 
 SubModules:
   LayoutDetection:
     module_name: layout_detection
-    model_name: PP-DocLayout-L
+    model_name: PP-DocLayout_plus-L
     model_dir: null
     batch_size: 8
     threshold: 
@@ -94,6 +94,12 @@ SubModules:
     model_name: PP-Chart2Table
     model_dir: null
     batch_size: 1 
+  RegionDetection:
+    module_name: layout_detection
+    model_name: PP-DocBlockLayout
+    model_dir: null
+    layout_nms: True
+    layout_merge_bboxes_mode: "small"
 
 SubPipelines:
   DocPreprocessor:
@@ -121,7 +127,7 @@ SubPipelines:
     SubModules:
       TextDetection:
         module_name: text_detection
-        model_name: PP-OCRv4_server_det
+        model_name: PP-OCRv5_server_det
         model_dir: null
         limit_side_len: 736
         limit_type: min
@@ -136,7 +142,7 @@ SubPipelines:
         batch_size: 8
       TextRecognition:
         module_name: text_recognition
-        model_name: PP-OCRv4_server_rec_doc
+        model_name: PP-OCRv5_server_rec
         model_dir: null
         batch_size: 8
         score_thresh: 0.0
@@ -172,6 +178,11 @@ SubPipelines:
         module_name: table_cells_detection
         model_name: RT-DETR-L_wireless_table_cell_det
         model_dir: null
+
+      TableOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
     SubPipelines:
       GeneralOCR:
         pipeline_name: OCR
@@ -181,7 +192,7 @@ SubPipelines:
         SubModules:
           TextDetection:
             module_name: text_detection
-            model_name: PP-OCRv4_server_det
+            model_name: PP-OCRv5_server_det
             model_dir: null
             limit_side_len: 736
             limit_type: min
@@ -196,7 +207,7 @@ SubPipelines:
             batch_size: 8
           TextRecognition:
             module_name: text_recognition
-            model_name: PP-OCRv4_server_rec_doc
+            model_name: PP-OCRv5_server_rec
             model_dir: null
             batch_size: 8
         score_thresh: 0.0
@@ -226,7 +237,7 @@ SubPipelines:
             unclip_ratio: 0.5
           TextRecognition:
             module_name: text_recognition
-            model_name: PP-OCRv4_server_rec
+            model_name: PP-OCRv5_server_rec
             model_dir: null
             batch_size: 8
             score_thresh: 0
@@ -239,6 +250,6 @@ SubPipelines:
     SubModules:
       FormulaRecognition:
         module_name: formula_recognition
-        model_name: PP-FormulaNet-L
+        model_name: PP-FormulaNet_plus-L
         model_dir: null
         batch_size: 8

+ 0 - 9
paddlex/inference/models/formula_recognition/result.py

@@ -18,7 +18,6 @@ import os
 import re
 import subprocess
 import tempfile
-from pathlib import Path
 from typing import List, Optional
 
 import numpy as np
@@ -38,14 +37,6 @@ if is_dep_available("pypdfium2"):
 
 
 class FormulaRecResult(BaseCVResult):
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
 
     def _to_str(self, *args, **kwargs):
         data = copy.deepcopy(self)

+ 4 - 6
paddlex/inference/models/object_detection/processors.py

@@ -746,16 +746,14 @@ class DetPostProcess:
         if layout_nms:
             selected_indices = nms(boxes, iou_same=0.6, iou_diff=0.98)
             boxes = np.array(boxes[selected_indices])
-        
+
         filter_large_image = True
         if filter_large_image and len(boxes) > 1:
             if img_size[0] > img_size[1]:
-                area_thres = 0.82 
+                area_thres = 0.82
             else:
                 area_thres = 0.93
-            image_index = (
-                self.labels.index("image") if "image" in self.labels else None
-            )
+            image_index = self.labels.index("image") if "image" in self.labels else None
             img_area = img_size[0] * img_size[1]
             filtered_boxes = []
             for box in boxes:
@@ -823,7 +821,7 @@ class DetPostProcess:
                 boxes = boxes[keep_mask]
 
         if boxes.size == 0:
-            return np.array([])
+            return []
 
         if layout_unclip_ratio:
             if isinstance(layout_unclip_ratio, float):

+ 0 - 10
paddlex/inference/models/table_structure_recognition/result.py

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import copy
-from pathlib import Path
 
 import numpy as np
 
@@ -28,15 +27,6 @@ if is_dep_available("opencv-contrib-python"):
 class TableRecResult(BaseCVResult):
     """SaveTableResults"""
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self):
         image = self["input_img"]
         bbox_res = self["bbox"]

+ 0 - 10
paddlex/inference/models/text_detection/result.py

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import copy
-from pathlib import Path
 
 import numpy as np
 
@@ -27,15 +26,6 @@ if is_dep_available("opencv-contrib-python"):
 @class_requires_deps("opencv-contrib-python")
 class TextDetResult(BaseCVResult):
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self):
         """draw rectangle"""
         boxes = self["dt_polys"]

+ 0 - 10
paddlex/inference/pipelines/formula_recognition/result.py

@@ -17,7 +17,6 @@ import os
 import random
 import subprocess
 import tempfile
-from pathlib import Path
 from typing import Dict, Tuple
 
 import numpy as np
@@ -45,15 +44,6 @@ if is_dep_available("opencv-contrib-python"):
 class FormulaRecognitionResult(BaseCVResult):
     """Formula Recognition Result"""
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self) -> Dict[str, Image.Image]:
         """
         Converts the internal data to a PIL Image with detection and recognition results.

+ 40 - 29
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -100,7 +100,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         self.use_seal_recognition = config.get("use_seal_recognition", True)
         self.use_region_detection = config.get(
             "use_region_detection",
-            False,
+            True,
         )
         self.use_formula_recognition = config.get(
             "use_formula_recognition",
@@ -494,7 +494,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             region_det_res["boxes"] = [
                 {
                     "coordinate": base_region_bbox,
-                    "label": "SupplementaryBlock",
+                    "label": "SupplementaryRegion",
                     "score": 1,
                 }
             ]
@@ -521,7 +521,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
                     matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
                     new_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
                     region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
-            # Supplement region block when there is no matched block
+            # Supplement region when there is no matched block
             if len(block_idxes_set) > 0:
                 while len(block_idxes_set) > 0:
                     matched_idxes = []
@@ -555,7 +555,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
                     region_det_res["boxes"].append(
                         {
                             "coordinate": supplement_region_bbox,
-                            "label": "SupplementaryBlock",
+                            "label": "SupplementaryRegion",
                             "score": 1,
                         }
                     )
@@ -950,7 +950,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         use_seal_recognition: Union[bool, None] = None,
         use_table_recognition: Union[bool, None] = None,
         use_formula_recognition: Union[bool, None] = None,
-        use_chart_recognition: Union[bool, None] = None,
+        use_chart_recognition: Union[bool, None] = False,
         use_region_detection: Union[bool, None] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
@@ -1117,9 +1117,19 @@ class _LayoutParsingPipelineV2(BasePipeline):
                 )
 
             if model_settings["use_table_recognition"]:
-                table_contents = []
-                for overall_ocr_res, formula_res_list, imgs_in_doc_for_img in zip(
-                    overall_ocr_results, formula_res_lists, imgs_in_doc
+                table_res_lists = []
+                for (
+                    layout_det_res,
+                    doc_preprocessor_image,
+                    overall_ocr_res,
+                    formula_res_list,
+                    imgs_in_doc_for_img,
+                ) in zip(
+                    layout_det_results,
+                    doc_preprocessor_images,
+                    overall_ocr_results,
+                    formula_res_lists,
+                    imgs_in_doc,
                 ):
                     table_contents_for_img = copy.deepcopy(overall_ocr_res)
                     for formula_res in formula_res_list:
@@ -1174,27 +1184,28 @@ class _LayoutParsingPipelineV2(BasePipeline):
                         table_contents_for_img["rec_polys"].append(poly_points)
                         table_contents_for_img["rec_scores"].append(img["score"])
 
-                    table_contents.append(table_contents_for_img)
-
-                table_res_all = list(
-                    self.table_recognition_pipeline(
-                        doc_preprocessor_images,
-                        use_doc_orientation_classify=False,
-                        use_doc_unwarping=False,
-                        use_layout_detection=False,
-                        use_ocr_model=False,
-                        overall_ocr_res=table_contents,
-                        layout_det_res=layout_det_results,
-                        cell_sort_by_y_projection=True,
-                        use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
-                        use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
-                        use_table_orientation_classify=use_table_orientation_classify,
-                        use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
-                        use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
-                        use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
-                    ),
-                )
-                table_res_lists = [item["table_res_list"] for item in table_res_all]
+                    table_res_all = list(
+                        self.table_recognition_pipeline(
+                            doc_preprocessor_image,
+                            use_doc_orientation_classify=False,
+                            use_doc_unwarping=False,
+                            use_layout_detection=False,
+                            use_ocr_model=False,
+                            overall_ocr_res=table_contents_for_img,
+                            layout_det_res=layout_det_res,
+                            cell_sort_by_y_projection=True,
+                            use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
+                            use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
+                            use_table_orientation_classify=use_table_orientation_classify,
+                            use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
+                            use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
+                            use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
+                        ),
+                    )
+                    single_table_res_lists = [
+                        item["table_res_list"] for item in table_res_all
+                    ]
+                    table_res_lists.extend(single_table_res_lists)
             else:
                 table_res_lists = [[] for _ in doc_preprocessor_images]
 

+ 0 - 10
paddlex/inference/pipelines/layout_parsing/result.py

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import copy
-from pathlib import Path
 from typing import Dict
 
 import numpy as np
@@ -31,15 +30,6 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         HtmlMixin.__init__(self)
         XlsxMixin.__init__(self)
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self) -> Dict[str, np.ndarray]:
         res_img_dict = {}
         model_settings = self["model_settings"]

+ 0 - 10
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -17,7 +17,6 @@ import copy
 import math
 import re
 from functools import partial
-from pathlib import Path
 from typing import List
 
 import numpy as np
@@ -204,15 +203,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         MarkdownMixin.__init__(self)
         JsonMixin.__init__(self)
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self) -> dict[str, np.ndarray]:
         from .utils import get_show_color
 

+ 12 - 7
paddlex/inference/pipelines/layout_parsing/utils.py

@@ -390,6 +390,10 @@ def is_english_letter(char):
     return bool(re.match(r"^[A-Za-z]$", char))
 
 
+def is_numeric(char):
+    return bool(re.match(r"^[\d.]+$", char))
+
+
 def is_non_breaking_punctuation(char):
     """
     判断一个字符是否是不需要换行的标点符号,包括全角和半角的符号。
@@ -481,16 +485,17 @@ def format_line(
         len(line_text) > 0 and is_english_letter(line_text[-1])
     ) or line_text.endswith("$"):
         line_text += " "
-    else:
-        if (
-            block_stop_coordinate - last_span_box[text_stop_index] > block_width * 0.3
-            and block_label != "formula"
-        ):
+    elif (
+        len(line_text) > 0
+        and not is_english_letter(line_text[-1])
+        and not is_non_breaking_punctuation(line_text[-1])
+        and not is_numeric(line_text[-1])
+    ) or text_direction == "vertical":
+        if block_stop_coordinate - last_span_box[text_stop_index] > block_width * 0.4:
             line_text += "\n"
         if (
             first_span_box[text_start_index] - block_start_coordinate
-            > block_width * 0.3
-            and block_label != "formula"
+            > block_width * 0.4
         ):
             line_text = "\n" + line_text
 

+ 11 - 14
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py

@@ -56,7 +56,7 @@ def get_nearest_edge_distance(
     return min_x_distance + min_y_distance
 
 
-def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
+def projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
     """
     Generate a 1D projection histogram from bounding boxes along a specified axis.
 
@@ -84,7 +84,7 @@ def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
     return projection
 
 
-def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
+def split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
     """
     Split the projection profile into segments based on specified thresholds.
 
@@ -144,8 +144,8 @@ def recursive_yx_cut(
     y_sorted_indices = np.array(indices)[y_sorted_indices]
 
     # Perform Y-axis projection
-    y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
-    y_intervals = _split_projection_profile(y_projection, 0, 1)
+    y_projection = projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
+    y_intervals = split_projection_profile(y_projection, 0, 1)
 
     if not y_intervals:
         return
@@ -165,8 +165,8 @@ def recursive_yx_cut(
         x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
 
         # Perform X-axis projection
-        x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
-        x_intervals = _split_projection_profile(x_projection, 0, min_gap)
+        x_projection = projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
+        x_intervals = split_projection_profile(x_projection, 0, min_gap)
 
         if not x_intervals:
             continue
@@ -216,8 +216,8 @@ def recursive_xy_cut(
     x_sorted_indices = np.array(indices)[x_sorted_indices]
 
     # Perform X-axis projection
-    x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
-    x_intervals = _split_projection_profile(x_projection, 0, 1)
+    x_projection = projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
+    x_intervals = split_projection_profile(x_projection, 0, 1)
 
     if not x_intervals:
         return
@@ -239,8 +239,8 @@ def recursive_xy_cut(
         y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
 
         # Perform Y-axis projection
-        y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
-        y_intervals = _split_projection_profile(y_projection, 0, min_gap)
+        y_projection = projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
+        y_intervals = split_projection_profile(y_projection, 0, min_gap)
 
         if not y_intervals:
             continue
@@ -543,9 +543,7 @@ def sort_normal_blocks(blocks, text_line_height, text_line_width, region_directi
     return blocks
 
 
-def get_cut_blocks(
-    blocks, cut_direction, cut_coordinates, overall_region_box, mask_labels=[]
-):
+def get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels=[]):
     """
     Cut blocks based on the given cut direction and coordinates.
 
@@ -553,7 +551,6 @@ def get_cut_blocks(
         blocks (list): list of blocks to be cut.
         cut_direction (str): cut direction, either "horizontal" or "vertical".
         cut_coordinates (list): list of cut coordinates.
-        overall_region_box (list): the overall region box that contains all blocks.
 
     Returns:
         list: a list of tuples containing the cut blocks and their corresponding mean width.

+ 43 - 19
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py

@@ -23,14 +23,15 @@ from ..utils import calculate_overlap_ratio, calculate_projection_overlap_ratio
 from .utils import (
     calculate_discontinuous_projection,
     get_cut_blocks,
-    get_nearest_edge_distance,
     insert_child_blocks,
     manhattan_insert,
+    projection_by_bboxes,
     recursive_xy_cut,
     recursive_yx_cut,
     reference_insert,
     shrink_overlapping_boxes,
     sort_normal_blocks,
+    split_projection_profile,
     update_doc_title_child_blocks,
     update_paragraph_title_child_blocks,
     update_vision_child_blocks,
@@ -131,27 +132,50 @@ def pre_process(
         current_interval = discontinuous[0]
         for interval in discontinuous[1:]:
             gap_len = interval[0] - current_interval[1]
-            if gap_len >= region.text_line_height * 5:
+            if gap_len >= region.text_line_height * 3:
                 cut_coordinates.append(current_interval[1])
-            elif gap_len > region.text_line_height * 2:
-                x1, _, x2, __ = region.bbox
-                y1 = current_interval[1]
-                y2 = interval[0]
-                bbox = [x1, y1, x2, y2]
-                ref_interval = interval[0] - current_interval[1]
-                ref_bboxes = []
-                for block in blocks:
-                    if get_nearest_edge_distance(bbox, block.bbox) < ref_interval * 2:
-                        ref_bboxes.append(block.bbox)
-                discontinuous = calculate_discontinuous_projection(
-                    ref_bboxes, direction=region.direction
+            elif gap_len > region.text_line_height * 1.8:
+                (pre_blocks, post_blocks) = get_cut_blocks(
+                    list(block_map.values()), cut_direction, [current_interval[1]], []
                 )
-                if len(discontinuous) != 2:
-                    cut_coordinates.append(current_interval[1])
+                pre_bboxes = np.array([block.bbox for block in pre_blocks])
+                post_bboxes = np.array([block.bbox for block in post_blocks])
+                projection_index = 1 if cut_direction == "horizontal" else 0
+                pre_projection = projection_by_bboxes(pre_bboxes, projection_index)
+                post_projection = projection_by_bboxes(post_bboxes, projection_index)
+                pre_projection_min = np.min(pre_projection)
+                post_projection_min = np.min(post_projection)
+                pre_projection_min += 5 if pre_projection_min != 0 else 0
+                post_projection_min += 5 if post_projection_min != 0 else 0
+                pre_intervals = split_projection_profile(
+                    pre_projection, pre_projection_min, 1
+                )
+                post_intervals = split_projection_profile(
+                    post_projection, post_projection_min, 1
+                )
+                pre_gap_boxes = []
+                if pre_intervals is not None:
+                    for start, end in zip(*pre_intervals):
+                        bbox = [0] * 4
+                        bbox[projection_index] = start
+                        bbox[projection_index + 2] = end
+                        pre_gap_boxes.append(bbox)
+                post_gap_boxes = []
+                if post_intervals is not None:
+                    for start, end in zip(*post_intervals):
+                        bbox = [0] * 4
+                        bbox[projection_index] = start
+                        bbox[projection_index + 2] = end
+                        post_gap_boxes.append(bbox)
+                max_gap_boxes_num = max(len(pre_gap_boxes), len(post_gap_boxes))
+                if max_gap_boxes_num > 0:
+                    discontinuous_intervals = calculate_discontinuous_projection(
+                        pre_gap_boxes + post_gap_boxes, direction=region.direction
+                    )
+                    if len(discontinuous_intervals) != max_gap_boxes_num:
+                        cut_coordinates.append(current_interval[1])
             current_interval = interval
-    cut_list = get_cut_blocks(
-        blocks, cut_direction, cut_coordinates, region.bbox, mask_labels
-    )
+    cut_list = get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels)
     pre_cut_list.extend(cut_list)
     if region.direction == "vertical":
         pre_cut_list = pre_cut_list[::-1]

+ 0 - 10
paddlex/inference/pipelines/ocr/result.py

@@ -14,7 +14,6 @@
 
 import math
 import random
-from pathlib import Path
 from typing import Dict
 
 import numpy as np
@@ -32,15 +31,6 @@ if is_dep_available("opencv-contrib-python"):
 class OCRResult(BaseCVResult):
     """OCR result"""
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def get_minarea_rect(self, points: np.ndarray) -> np.ndarray:
         """
         Get the minimum area rectangle for the given points using OpenCV.