소스 검색

update document for layout_parsing_v2 & layout_parsing_v2 order (#3320)

* delete layout_parsing_v2 color function

* update document for layout_parsing_v2 & layout_parsing_v2 order
shuai.liu 9 달 전
부모
커밋
ab4a282a44

+ 12 - 9
docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing_v2.md

@@ -1096,9 +1096,18 @@ for item in markdown_images:
         - `angle`: `(int)` 文档图像方向分类子模块的预测结果,启用时返回实际角度值
 
     - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
-        - `layout_bbox`: `(np.ndarray)` 版面区域的边界框。
-        - `label`: `(str)` key 为版面区域的标签,例如`text`, `table`等,内容为版面区域内的内容。
-        - `layout`: `(str)` 版面排版类型,例如 `double`, `single` 等。
+        - `block_bbox`: `(np.ndarray)` 版面区域的边界框。
+        - `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。
+        - `block_content`: `(str)` 版面区域内的内容。
+        - `seg_start_flag`: `(bool)` 标识该版面区域是否是段落的开始。
+        - `seg_end_flag`: `(bool)` 标识该版面区域是否是段落的结束。
+        - `sub_label`: `(str)` 版面区域的子标签,例如`text`的子标签可能为`title_text`。
+        - `sub_index`: `(int)` 版面区域的子索引,用于恢复Markdown。
+        - `index`: `(int)` 版面区域的索引,用于显示版面排序结果。
+
+
+
+
 
     - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的字典
       - `input_path`: `(Union[str, None])` 图像OCR子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`
@@ -1121,12 +1130,6 @@ for item in markdown_images:
       - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤
       - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys`
 
-    - `text_paragraphs_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 段落OCR结果,版面类型非表格、印章和公式类型的段落OCR结果
-        - `rec_polys`: `(List[numpy.ndarray])` 文本检测框列表,格式同`dt_polys`
-        - `rec_texts`: `(List[str])` 文本识别结果列表
-        - `rec_scores`: `(List[float])` 文本识别结果的置信度列表
-        - `rec_boxes`: `(numpy.ndarray)` 检测框的矩形边界框数组,shape为(n, 4),dtype为int16。每一行表示一个
-
     - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典
         - `rec_formula`: `(str)` 公式识别结果
         - `rec_polys`: `(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16

+ 0 - 37
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -44,7 +44,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         XlsxMixin.__init__(self)
         MarkdownMixin.__init__(self)
         JsonMixin.__init__(self)
-        self.already_sorted = False
 
     def _get_input_fn(self):
         fn = super()._get_input_fn()
@@ -58,7 +57,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
     def _to_img(self) -> dict[str, np.ndarray]:
         res_img_dict = {}
         model_settings = self["model_settings"]
-        page_index = self["page_index"]
         if model_settings["use_doc_preprocessor"]:
             for key, value in self["doc_preprocessor_res"].img.items():
                 res_img_dict[key] = value
@@ -95,21 +93,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         image = Image.fromarray(self["doc_preprocessor_res"]["output_img"])
         draw = ImageDraw.Draw(image, "RGBA")
         parsing_result = self["parsing_res_list"]
-
-        if self.already_sorted == False:
-            parsing_result = get_layout_ordering(
-                parsing_result,
-                no_mask_labels=[
-                    "text",
-                    "formula",
-                    "algorithm",
-                    "reference",
-                    "content",
-                    "abstract",
-                ],
-                already_sorted=self.already_sorted,
-            )
-
         for block in parsing_result:
             bbox = block["block_bbox"]
             index = block.get("index", None)
@@ -120,7 +103,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 text_position = (bbox[2] + 2, bbox[1] - 10)
                 draw.text(text_position, str(index), fill="red")
 
-        self.already_sorted = True
         res_img_dict["layout_order_res"] = image
 
         return res_img_dict
@@ -256,23 +238,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         Returns:
             Dict
         """
-
-        parsing_result = self["parsing_res_list"]
-        if self.already_sorted == False:
-            parsing_result = get_layout_ordering(
-                parsing_result,
-                no_mask_labels=[
-                    "text",
-                    "formula",
-                    "algorithm",
-                    "reference",
-                    "content",
-                    "abstract",
-                ],
-                already_sorted=self.already_sorted,
-            )
-        self.already_sorted == True
-
         recursive_img_array2path(self["parsing_res_list"], labels=["block_image"])
 
         def _format_data(obj):
@@ -335,7 +300,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 "text": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("\n", " "),
-                # 'number': lambda: str(block['number']),
                 "abstract": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("\n", " "),
@@ -346,7 +310,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 "chart": lambda: format_image("block_image"),
                 "formula": lambda: f"$${block['block_content']}$$",
                 "table": format_table,
-                # "reference": format_reference,
                 "reference": lambda: block["block_content"],
                 "algorithm": lambda: block["block_content"].strip("\n"),
                 "seal": lambda: format_image("block_content"),

+ 62 - 41
paddlex/inference/pipelines/layout_parsing/utils.py

@@ -491,6 +491,18 @@ def get_single_block_parsing_res(
                     },
                 )
 
+    single_block_layout_parsing_res = get_layout_ordering(
+        single_block_layout_parsing_res,
+        no_mask_labels=[
+            "text",
+            "formula",
+            "algorithm",
+            "reference",
+            "content",
+            "abstract",
+        ],
+    )
+
     return single_block_layout_parsing_res
 
 
@@ -1253,42 +1265,36 @@ def _get_sub_category(
 
 
 def get_layout_ordering(
-    data: List[Dict[str, Any]],
+    parsing_res_list: List[Dict[str, Any]],
     no_mask_labels: List[str] = [],
-    already_sorted: bool = False,
 ) -> None:
     """
     Process layout parsing results to remove overlapping bounding boxes
     and assign an ordering index based on their positions.
 
     Modifies:
-        The 'data' list by adding an 'index' to each block.
+        The 'parsing_res_list' list by adding an 'index' to each block.
 
     Args:
-        data (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
+        parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
         no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
-        already_sorted (bool): Assumes data is already sorted by position if True.
     """
-    if already_sorted:
-        return data
-
     title_text_labels = ["doc_title"]
     title_labels = ["doc_title", "paragraph_title"]
     vision_labels = ["image", "table", "seal", "chart", "figure"]
     vision_title_labels = ["table_title", "chart_title", "figure_title"]
 
-    parsing_result = data
-    parsing_result, _ = _remove_overlap_blocks(
-        parsing_result,
+    parsing_res_list, _ = _remove_overlap_blocks(
+        parsing_res_list,
         threshold=0.5,
         smaller=True,
     )
-    parsing_result = _get_sub_category(parsing_result, title_text_labels)
+    parsing_res_list = _get_sub_category(parsing_res_list, title_text_labels)
 
     doc_flag = False
-    median_width = _get_text_median_width(parsing_result)
-    parsing_result, projection_direction = _get_layout_property(
-        parsing_result,
+    median_width = _get_text_median_width(parsing_res_list)
+    parsing_res_list, projection_direction = _get_layout_property(
+        parsing_res_list,
         median_width,
         no_mask_labels=no_mask_labels,
         threshold=0.3,
@@ -1306,7 +1312,7 @@ def get_layout_ordering(
 
     drop_indexes = []
 
-    for index, block in enumerate(parsing_result):
+    for index, block in enumerate(parsing_res_list):
         label = block["sub_label"]
         block["block_bbox"] = list(map(int, block["block_bbox"]))
 
@@ -1337,15 +1343,15 @@ def get_layout_ordering(
             drop_indexes.append(index)
 
     for index in sorted(drop_indexes, reverse=True):
-        del parsing_result[index]
+        del parsing_res_list[index]
 
-    if len(parsing_result) > 0:
+    if len(parsing_res_list) > 0:
         # single text label
-        if len(double_text_blocks) > len(parsing_result) or projection_direction:
-            parsing_result.extend(title_blocks + double_text_blocks)
+        if len(double_text_blocks) > len(parsing_res_list) or projection_direction:
+            parsing_res_list.extend(title_blocks + double_text_blocks)
             title_blocks = []
             double_text_blocks = []
-            block_bboxes = [block["block_bbox"] for block in parsing_result]
+            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
             block_bboxes.sort(
                 key=lambda x: (
                     x[0] // max(20, median_width),
@@ -1359,7 +1365,7 @@ def get_layout_ordering(
                 min_gap=1,
             )
         else:
-            block_bboxes = [block["block_bbox"] for block in parsing_result]
+            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
             block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
             block_bboxes = np.array(block_bboxes)
             sorted_indices = sort_by_xycut(
@@ -1370,7 +1376,7 @@ def get_layout_ordering(
 
         sorted_boxes = block_bboxes[sorted_indices].tolist()
 
-        for block in parsing_result:
+        for block in parsing_res_list:
             block["index"] = sorted_boxes.index(block["block_bbox"]) + 1
             block["sub_index"] = sorted_boxes.index(block["block_bbox"]) + 1
 
@@ -1384,7 +1390,7 @@ def get_layout_ordering(
                 float("inf"),
             ]  # for double text
             nearest_gt_index = 0
-            for match_block in parsing_result:
+            for match_block in parsing_res_list:
                 match_bbox = match_block["block_bbox"]
                 if distance_type == "nearest_iou_edge_distance":
                     distance, min_distance_config = _nearest_iou_edge_distance(
@@ -1468,7 +1474,7 @@ def get_layout_ordering(
             else:
                 block["sub_index"] = nearest_gt_index
 
-            parsing_result.append(block)
+            parsing_res_list.append(block)
 
     # double text label
     double_text_blocks.sort(
@@ -1482,11 +1488,11 @@ def get_layout_ordering(
         double_text_blocks,
         distance_type="nearest_iou_edge_distance",
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
     )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
 
@@ -1506,15 +1512,15 @@ def get_layout_ordering(
             label: priority for priority, label in enumerate(text_sort_labels)
         }
         doc_titles = []
-        for i, block in enumerate(parsing_result):
+        for i, block in enumerate(parsing_res_list):
             if block["block_label"] == "doc_title":
                 doc_titles.append(
                     (i, block["block_bbox"][1], block["block_bbox"][0]),
                 )
         doc_titles.sort(key=lambda x: (x[1], x[2]))
         first_doc_title_index = doc_titles[0][0]
-        parsing_result[first_doc_title_index]["index"] = 1
-        parsing_result.sort(
+        parsing_res_list[first_doc_title_index]["index"] = 1
+        parsing_res_list.sort(
             key=lambda x: (
                 x["index"],
                 text_label_priority.get(x["block_label"], 9999),
@@ -1523,7 +1529,7 @@ def get_layout_ordering(
             ),
         )
     else:
-        parsing_result.sort(
+        parsing_res_list.sort(
             key=lambda x: (
                 x["index"],
                 x["block_bbox"][1],
@@ -1531,7 +1537,7 @@ def get_layout_ordering(
             ),
         )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
 
@@ -1541,7 +1547,7 @@ def get_layout_ordering(
     text_label_priority = {
         label: priority for priority, label in enumerate(text_sort_labels)
     }
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
             x["index"],
             text_label_priority.get(x["sub_label"], 9999),
@@ -1550,7 +1556,7 @@ def get_layout_ordering(
         ),
     )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
 
@@ -1560,7 +1566,7 @@ def get_layout_ordering(
         distance_type="nearest_iou_edge_distance",
         is_add_index=False,
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
             x["sub_index"],
             x["block_bbox"][1],
@@ -1568,7 +1574,7 @@ def get_layout_ordering(
         ),
     )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
 
     # image,figure,chart,seal title label
@@ -1577,7 +1583,7 @@ def get_layout_ordering(
         distance_type="nearest_iou_edge_distance",
         is_add_index=False,
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
             x["sub_index"],
             x["block_bbox"][1],
@@ -1585,7 +1591,7 @@ def get_layout_ordering(
         ),
     )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
 
     # vision footnote label
@@ -1595,7 +1601,7 @@ def get_layout_ordering(
         is_add_index=False,
     )
     text_label_priority = {"vision_footnote": 9999}
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
             x["sub_index"],
             text_label_priority.get(x["sub_label"], 0),
@@ -1604,13 +1610,28 @@ def get_layout_ordering(
         ),
     )
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
 
     # header、footnote、header_image... label
     nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
 
-    return data
+    parsing_res_list = [
+        {
+            "block_label": parsing_res["block_label"],
+            "block_content": parsing_res["block_content"],
+            "block_bbox": parsing_res["block_bbox"],
+            "block_image": parsing_res.get("block_image", None),
+            "seg_start_flag": parsing_res["seg_start_flag"],
+            "seg_end_flag": parsing_res["seg_end_flag"],
+            "sub_label": parsing_res["sub_label"],
+            "sub_index": parsing_res["sub_index"],
+            "index": parsing_res.get("index", None),
+        }
+        for parsing_res in parsing_res_list
+    ]
+
+    return parsing_res_list
 
 
 def _manhattan_distance(