Browse Source

update document for layout_parsing_v2 & layout_parsing_v2 order (#3320)

* delete layout_parsing_v2 color function

* update document for layout_parsing_v2 & layout_parsing_v2 order
shuai.liu 9 months ago
parent
commit
ab4a282a44

+ 12 - 9
docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing_v2.md

@@ -1096,9 +1096,18 @@ for item in markdown_images:
         - `angle`: `(int)` 文档图像方向分类子模块的预测结果,启用时返回实际角度值
         - `angle`: `(int)` 文档图像方向分类子模块的预测结果,启用时返回实际角度值
 
 
     - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
     - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
-        - `layout_bbox`: `(np.ndarray)` 版面区域的边界框。
-        - `label`: `(str)` key 为版面区域的标签,例如`text`, `table`等,内容为版面区域内的内容。
-        - `layout`: `(str)` 版面排版类型,例如 `double`, `single` 等。
+        - `block_bbox`: `(np.ndarray)` 版面区域的边界框。
+        - `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。
+        - `block_content`: `(str)` 版面区域内的内容。
+        - `seg_start_flag`: `(bool)` 标识该版面区域是否是段落的开始。
+        - `seg_end_flag`: `(bool)` 标识该版面区域是否是段落的结束。
+        - `sub_label`: `(str)` 版面区域的子标签,例如`text`的子标签可能为`title_text`。
+        - `sub_index`: `(int)` 版面区域的子索引,用于恢复Markdown。
+        - `index`: `(int)` 版面区域的索引,用于显示版面排序结果。
+
+
+
+
 
 
     - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的字典
     - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的字典
       - `input_path`: `(Union[str, None])` 图像OCR子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`
       - `input_path`: `(Union[str, None])` 图像OCR子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`
@@ -1121,12 +1130,6 @@ for item in markdown_images:
       - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤
       - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤
       - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys`
       - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys`
 
 
-    - `text_paragraphs_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 段落OCR结果,版面类型非表格、印章和公式类型的段落OCR结果
-        - `rec_polys`: `(List[numpy.ndarray])` 文本检测框列表,格式同`dt_polys`
-        - `rec_texts`: `(List[str])` 文本识别结果列表
-        - `rec_scores`: `(List[float])` 文本识别结果的置信度列表
-        - `rec_boxes`: `(numpy.ndarray)` 检测框的矩形边界框数组,shape为(n, 4),dtype为int16。每一行表示一个
-
     - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典
     - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典
         - `rec_formula`: `(str)` 公式识别结果
         - `rec_formula`: `(str)` 公式识别结果
         - `rec_polys`: `(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16
         - `rec_polys`: `(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16

+ 0 - 37
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -44,7 +44,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         XlsxMixin.__init__(self)
         XlsxMixin.__init__(self)
         MarkdownMixin.__init__(self)
         MarkdownMixin.__init__(self)
         JsonMixin.__init__(self)
         JsonMixin.__init__(self)
-        self.already_sorted = False
 
 
     def _get_input_fn(self):
     def _get_input_fn(self):
         fn = super()._get_input_fn()
         fn = super()._get_input_fn()
@@ -58,7 +57,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
     def _to_img(self) -> dict[str, np.ndarray]:
     def _to_img(self) -> dict[str, np.ndarray]:
         res_img_dict = {}
         res_img_dict = {}
         model_settings = self["model_settings"]
         model_settings = self["model_settings"]
-        page_index = self["page_index"]
         if model_settings["use_doc_preprocessor"]:
         if model_settings["use_doc_preprocessor"]:
             for key, value in self["doc_preprocessor_res"].img.items():
             for key, value in self["doc_preprocessor_res"].img.items():
                 res_img_dict[key] = value
                 res_img_dict[key] = value
@@ -95,21 +93,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         image = Image.fromarray(self["doc_preprocessor_res"]["output_img"])
         image = Image.fromarray(self["doc_preprocessor_res"]["output_img"])
         draw = ImageDraw.Draw(image, "RGBA")
         draw = ImageDraw.Draw(image, "RGBA")
         parsing_result = self["parsing_res_list"]
         parsing_result = self["parsing_res_list"]
-
-        if self.already_sorted == False:
-            parsing_result = get_layout_ordering(
-                parsing_result,
-                no_mask_labels=[
-                    "text",
-                    "formula",
-                    "algorithm",
-                    "reference",
-                    "content",
-                    "abstract",
-                ],
-                already_sorted=self.already_sorted,
-            )
-
         for block in parsing_result:
         for block in parsing_result:
             bbox = block["block_bbox"]
             bbox = block["block_bbox"]
             index = block.get("index", None)
             index = block.get("index", None)
@@ -120,7 +103,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 text_position = (bbox[2] + 2, bbox[1] - 10)
                 text_position = (bbox[2] + 2, bbox[1] - 10)
                 draw.text(text_position, str(index), fill="red")
                 draw.text(text_position, str(index), fill="red")
 
 
-        self.already_sorted = True
         res_img_dict["layout_order_res"] = image
         res_img_dict["layout_order_res"] = image
 
 
         return res_img_dict
         return res_img_dict
@@ -256,23 +238,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         Returns:
         Returns:
             Dict
             Dict
         """
         """
-
-        parsing_result = self["parsing_res_list"]
-        if self.already_sorted == False:
-            parsing_result = get_layout_ordering(
-                parsing_result,
-                no_mask_labels=[
-                    "text",
-                    "formula",
-                    "algorithm",
-                    "reference",
-                    "content",
-                    "abstract",
-                ],
-                already_sorted=self.already_sorted,
-            )
-        self.already_sorted == True
-
         recursive_img_array2path(self["parsing_res_list"], labels=["block_image"])
         recursive_img_array2path(self["parsing_res_list"], labels=["block_image"])
 
 
         def _format_data(obj):
         def _format_data(obj):
@@ -335,7 +300,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 "text": lambda: block["block_content"]
                 "text": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("-\n", " ")
                 .replace("\n", " "),
                 .replace("\n", " "),
-                # 'number': lambda: str(block['number']),
                 "abstract": lambda: block["block_content"]
                 "abstract": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("-\n", " ")
                 .replace("\n", " "),
                 .replace("\n", " "),
@@ -346,7 +310,6 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 "chart": lambda: format_image("block_image"),
                 "chart": lambda: format_image("block_image"),
                 "formula": lambda: f"$${block['block_content']}$$",
                 "formula": lambda: f"$${block['block_content']}$$",
                 "table": format_table,
                 "table": format_table,
-                # "reference": format_reference,
                 "reference": lambda: block["block_content"],
                 "reference": lambda: block["block_content"],
                 "algorithm": lambda: block["block_content"].strip("\n"),
                 "algorithm": lambda: block["block_content"].strip("\n"),
                 "seal": lambda: format_image("block_content"),
                 "seal": lambda: format_image("block_content"),

+ 62 - 41
paddlex/inference/pipelines/layout_parsing/utils.py

@@ -491,6 +491,18 @@ def get_single_block_parsing_res(
                     },
                     },
                 )
                 )
 
 
+    single_block_layout_parsing_res = get_layout_ordering(
+        single_block_layout_parsing_res,
+        no_mask_labels=[
+            "text",
+            "formula",
+            "algorithm",
+            "reference",
+            "content",
+            "abstract",
+        ],
+    )
+
     return single_block_layout_parsing_res
     return single_block_layout_parsing_res
 
 
 
 
@@ -1253,42 +1265,36 @@ def _get_sub_category(
 
 
 
 
 def get_layout_ordering(
 def get_layout_ordering(
-    data: List[Dict[str, Any]],
+    parsing_res_list: List[Dict[str, Any]],
     no_mask_labels: List[str] = [],
     no_mask_labels: List[str] = [],
-    already_sorted: bool = False,
 ) -> None:
 ) -> None:
     """
     """
     Process layout parsing results to remove overlapping bounding boxes
     Process layout parsing results to remove overlapping bounding boxes
     and assign an ordering index based on their positions.
     and assign an ordering index based on their positions.
 
 
     Modifies:
     Modifies:
-        The 'data' list by adding an 'index' to each block.
+        The 'parsing_res_list' list by adding an 'index' to each block.
 
 
     Args:
     Args:
-        data (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
+        parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
         no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
         no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
-        already_sorted (bool): Assumes data is already sorted by position if True.
     """
     """
-    if already_sorted:
-        return data
-
     title_text_labels = ["doc_title"]
     title_text_labels = ["doc_title"]
     title_labels = ["doc_title", "paragraph_title"]
     title_labels = ["doc_title", "paragraph_title"]
     vision_labels = ["image", "table", "seal", "chart", "figure"]
     vision_labels = ["image", "table", "seal", "chart", "figure"]
     vision_title_labels = ["table_title", "chart_title", "figure_title"]
     vision_title_labels = ["table_title", "chart_title", "figure_title"]
 
 
-    parsing_result = data
-    parsing_result, _ = _remove_overlap_blocks(
-        parsing_result,
+    parsing_res_list, _ = _remove_overlap_blocks(
+        parsing_res_list,
         threshold=0.5,
         threshold=0.5,
         smaller=True,
         smaller=True,
     )
     )
-    parsing_result = _get_sub_category(parsing_result, title_text_labels)
+    parsing_res_list = _get_sub_category(parsing_res_list, title_text_labels)
 
 
     doc_flag = False
     doc_flag = False
-    median_width = _get_text_median_width(parsing_result)
-    parsing_result, projection_direction = _get_layout_property(
-        parsing_result,
+    median_width = _get_text_median_width(parsing_res_list)
+    parsing_res_list, projection_direction = _get_layout_property(
+        parsing_res_list,
         median_width,
         median_width,
         no_mask_labels=no_mask_labels,
         no_mask_labels=no_mask_labels,
         threshold=0.3,
         threshold=0.3,
@@ -1306,7 +1312,7 @@ def get_layout_ordering(
 
 
     drop_indexes = []
     drop_indexes = []
 
 
-    for index, block in enumerate(parsing_result):
+    for index, block in enumerate(parsing_res_list):
         label = block["sub_label"]
         label = block["sub_label"]
         block["block_bbox"] = list(map(int, block["block_bbox"]))
         block["block_bbox"] = list(map(int, block["block_bbox"]))
 
 
@@ -1337,15 +1343,15 @@ def get_layout_ordering(
             drop_indexes.append(index)
             drop_indexes.append(index)
 
 
     for index in sorted(drop_indexes, reverse=True):
     for index in sorted(drop_indexes, reverse=True):
-        del parsing_result[index]
+        del parsing_res_list[index]
 
 
-    if len(parsing_result) > 0:
+    if len(parsing_res_list) > 0:
         # single text label
         # single text label
-        if len(double_text_blocks) > len(parsing_result) or projection_direction:
-            parsing_result.extend(title_blocks + double_text_blocks)
+        if len(double_text_blocks) > len(parsing_res_list) or projection_direction:
+            parsing_res_list.extend(title_blocks + double_text_blocks)
             title_blocks = []
             title_blocks = []
             double_text_blocks = []
             double_text_blocks = []
-            block_bboxes = [block["block_bbox"] for block in parsing_result]
+            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
             block_bboxes.sort(
             block_bboxes.sort(
                 key=lambda x: (
                 key=lambda x: (
                     x[0] // max(20, median_width),
                     x[0] // max(20, median_width),
@@ -1359,7 +1365,7 @@ def get_layout_ordering(
                 min_gap=1,
                 min_gap=1,
             )
             )
         else:
         else:
-            block_bboxes = [block["block_bbox"] for block in parsing_result]
+            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
             block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
             block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
             block_bboxes = np.array(block_bboxes)
             block_bboxes = np.array(block_bboxes)
             sorted_indices = sort_by_xycut(
             sorted_indices = sort_by_xycut(
@@ -1370,7 +1376,7 @@ def get_layout_ordering(
 
 
         sorted_boxes = block_bboxes[sorted_indices].tolist()
         sorted_boxes = block_bboxes[sorted_indices].tolist()
 
 
-        for block in parsing_result:
+        for block in parsing_res_list:
             block["index"] = sorted_boxes.index(block["block_bbox"]) + 1
             block["index"] = sorted_boxes.index(block["block_bbox"]) + 1
             block["sub_index"] = sorted_boxes.index(block["block_bbox"]) + 1
             block["sub_index"] = sorted_boxes.index(block["block_bbox"]) + 1
 
 
@@ -1384,7 +1390,7 @@ def get_layout_ordering(
                 float("inf"),
                 float("inf"),
             ]  # for double text
             ]  # for double text
             nearest_gt_index = 0
             nearest_gt_index = 0
-            for match_block in parsing_result:
+            for match_block in parsing_res_list:
                 match_bbox = match_block["block_bbox"]
                 match_bbox = match_block["block_bbox"]
                 if distance_type == "nearest_iou_edge_distance":
                 if distance_type == "nearest_iou_edge_distance":
                     distance, min_distance_config = _nearest_iou_edge_distance(
                     distance, min_distance_config = _nearest_iou_edge_distance(
@@ -1468,7 +1474,7 @@ def get_layout_ordering(
             else:
             else:
                 block["sub_index"] = nearest_gt_index
                 block["sub_index"] = nearest_gt_index
 
 
-            parsing_result.append(block)
+            parsing_res_list.append(block)
 
 
     # double text label
     # double text label
     double_text_blocks.sort(
     double_text_blocks.sort(
@@ -1482,11 +1488,11 @@ def get_layout_ordering(
         double_text_blocks,
         double_text_blocks,
         distance_type="nearest_iou_edge_distance",
         distance_type="nearest_iou_edge_distance",
     )
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
         key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
     )
     )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
@@ -1506,15 +1512,15 @@ def get_layout_ordering(
             label: priority for priority, label in enumerate(text_sort_labels)
             label: priority for priority, label in enumerate(text_sort_labels)
         }
         }
         doc_titles = []
         doc_titles = []
-        for i, block in enumerate(parsing_result):
+        for i, block in enumerate(parsing_res_list):
             if block["block_label"] == "doc_title":
             if block["block_label"] == "doc_title":
                 doc_titles.append(
                 doc_titles.append(
                     (i, block["block_bbox"][1], block["block_bbox"][0]),
                     (i, block["block_bbox"][1], block["block_bbox"][0]),
                 )
                 )
         doc_titles.sort(key=lambda x: (x[1], x[2]))
         doc_titles.sort(key=lambda x: (x[1], x[2]))
         first_doc_title_index = doc_titles[0][0]
         first_doc_title_index = doc_titles[0][0]
-        parsing_result[first_doc_title_index]["index"] = 1
-        parsing_result.sort(
+        parsing_res_list[first_doc_title_index]["index"] = 1
+        parsing_res_list.sort(
             key=lambda x: (
             key=lambda x: (
                 x["index"],
                 x["index"],
                 text_label_priority.get(x["block_label"], 9999),
                 text_label_priority.get(x["block_label"], 9999),
@@ -1523,7 +1529,7 @@ def get_layout_ordering(
             ),
             ),
         )
         )
     else:
     else:
-        parsing_result.sort(
+        parsing_res_list.sort(
             key=lambda x: (
             key=lambda x: (
                 x["index"],
                 x["index"],
                 x["block_bbox"][1],
                 x["block_bbox"][1],
@@ -1531,7 +1537,7 @@ def get_layout_ordering(
             ),
             ),
         )
         )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
@@ -1541,7 +1547,7 @@ def get_layout_ordering(
     text_label_priority = {
     text_label_priority = {
         label: priority for priority, label in enumerate(text_sort_labels)
         label: priority for priority, label in enumerate(text_sort_labels)
     }
     }
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
         key=lambda x: (
             x["index"],
             x["index"],
             text_label_priority.get(x["sub_label"], 9999),
             text_label_priority.get(x["sub_label"], 9999),
@@ -1550,7 +1556,7 @@ def get_layout_ordering(
         ),
         ),
     )
     )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["index"] = idx + 1
         block["index"] = idx + 1
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
@@ -1560,7 +1566,7 @@ def get_layout_ordering(
         distance_type="nearest_iou_edge_distance",
         distance_type="nearest_iou_edge_distance",
         is_add_index=False,
         is_add_index=False,
     )
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
         key=lambda x: (
             x["sub_index"],
             x["sub_index"],
             x["block_bbox"][1],
             x["block_bbox"][1],
@@ -1568,7 +1574,7 @@ def get_layout_ordering(
         ),
         ),
     )
     )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
     # image,figure,chart,seal title label
     # image,figure,chart,seal title label
@@ -1577,7 +1583,7 @@ def get_layout_ordering(
         distance_type="nearest_iou_edge_distance",
         distance_type="nearest_iou_edge_distance",
         is_add_index=False,
         is_add_index=False,
     )
     )
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
         key=lambda x: (
             x["sub_index"],
             x["sub_index"],
             x["block_bbox"][1],
             x["block_bbox"][1],
@@ -1585,7 +1591,7 @@ def get_layout_ordering(
         ),
         ),
     )
     )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
     # vision footnote label
     # vision footnote label
@@ -1595,7 +1601,7 @@ def get_layout_ordering(
         is_add_index=False,
         is_add_index=False,
     )
     )
     text_label_priority = {"vision_footnote": 9999}
     text_label_priority = {"vision_footnote": 9999}
-    parsing_result.sort(
+    parsing_res_list.sort(
         key=lambda x: (
         key=lambda x: (
             x["sub_index"],
             x["sub_index"],
             text_label_priority.get(x["sub_label"], 0),
             text_label_priority.get(x["sub_label"], 0),
@@ -1604,13 +1610,28 @@ def get_layout_ordering(
         ),
         ),
     )
     )
 
 
-    for idx, block in enumerate(parsing_result):
+    for idx, block in enumerate(parsing_res_list):
         block["sub_index"] = idx + 1
         block["sub_index"] = idx + 1
 
 
     # header、footnote、header_image... label
     # header、footnote、header_image... label
     nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
     nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
 
 
-    return data
+    parsing_res_list = [
+        {
+            "block_label": parsing_res["block_label"],
+            "block_content": parsing_res["block_content"],
+            "block_bbox": parsing_res["block_bbox"],
+            "block_image": parsing_res.get("block_image", None),
+            "seg_start_flag": parsing_res["seg_start_flag"],
+            "seg_end_flag": parsing_res["seg_end_flag"],
+            "sub_label": parsing_res["sub_label"],
+            "sub_index": parsing_res["sub_index"],
+            "index": parsing_res.get("index", None),
+        }
+        for parsing_res in parsing_res_list
+    ]
+
+    return parsing_res_list
 
 
 
 
 def _manhattan_distance(
 def _manhattan_distance(