Kaynağa Gözat

support format block content (#4507)

* support save markdown content in parsing_res_list

* update docs
changdazhou 2 ay önce
ebeveyn
işleme
260b7ce919

+ 1 - 0
docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md

@@ -1994,6 +1994,7 @@ In the above Python script, the following steps are executed:
         - `use_seal_recognition`: `(bool)` Controls whether to enable the seal recognition sub-pipeline.
         - `use_table_recognition`: `(bool)` Controls whether to enable the table recognition sub-pipeline.
         - `use_formula_recognition`: `(bool)` Controls whether to enable the formula recognition sub-pipeline.
+        - `format_block_content`: `(bool)` Controls whether the `block_content` of each parsing result is formatted as Markdown.
 
     - `parsing_res_list`: `(List[Dict])` A list of parsing results, where each element is a dictionary. The order of the list is the reading order after parsing.
         - `block_bbox`: `(np.ndarray)` The bounding box of the layout area.

+ 1 - 0
docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md

@@ -1956,6 +1956,7 @@ for item in markdown_images:
         - `use_seal_recognition`: `(bool)` 控制是否启用印章识别子产线
         - `use_table_recognition`: `(bool)` 控制是否启用表格识别子产线
         - `use_formula_recognition`: `(bool)` 控制是否启用公式识别子产线
+        - `format_block_content`: `(bool)` 控制是否将 `block_content` 中的内容格式化为Markdown格式
 
     - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果字典,仅当`use_doc_preprocessor=True`时存在
         - `input_path`: `(str)` 文档预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`,此处为`None`

+ 1 - 0
paddlex/configs/pipelines/PP-StructureV3.yaml

@@ -9,6 +9,7 @@ use_table_recognition: True
 use_formula_recognition: True
 use_chart_recognition: False
 use_region_detection: True
+format_block_content: False
 
 SubModules:
   LayoutDetection:

+ 10 - 0
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -106,6 +106,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             self.use_doc_preprocessor = False
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
+        self.format_block_content = config.get("format_block_content", False)
         self.use_region_detection = config.get(
             "use_region_detection",
             True,
@@ -848,6 +849,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         use_formula_recognition: Union[bool, None],
         use_chart_recognition: Union[bool, None],
         use_region_detection: Union[bool, None],
+        format_block_content: Union[bool, None],
     ) -> dict:
         """
         Get the model settings based on the provided parameters or default values.
@@ -858,6 +860,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
             use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
             use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
+            format_block_content (Union[bool, None]): Formats each block's content as Markdown if True. Defaults to system setting if None.
 
         Returns:
             dict: A dictionary containing the model settings.
@@ -886,6 +889,9 @@ class _LayoutParsingPipelineV2(BasePipeline):
         if use_chart_recognition is None:
             use_chart_recognition = self.use_chart_recognition
 
+        if format_block_content is None:
+            format_block_content = self.format_block_content
+
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
             use_seal_recognition=use_seal_recognition,
@@ -893,6 +899,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_formula_recognition=use_formula_recognition,
             use_chart_recognition=use_chart_recognition,
             use_region_detection=use_region_detection,
+            format_block_content=format_block_content,
         )
 
     def predict(
@@ -906,6 +913,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         use_formula_recognition: Union[bool, None] = None,
         use_chart_recognition: Union[bool, None] = None,
         use_region_detection: Union[bool, None] = None,
+        format_block_content: Union[bool, None] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
         layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
@@ -943,6 +951,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
             use_region_detection (Optional[bool]): Whether to use region detection.
+            format_block_content (Optional[bool]): Whether to format block content as Markdown.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
             layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
             layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -982,6 +991,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_formula_recognition,
             use_chart_recognition,
             use_region_detection,
+            format_block_content,
         )
 
         if not self.check_model_settings_valid(model_settings):

+ 90 - 5
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -278,23 +278,108 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         Returns:
             Dict[str, str]: A dictionary containing the object's data in JSON format.
         """
+        if self["model_settings"].get("format_block_content", False):
+            original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
+            format_text_func = lambda block: format_centered_by_html(
+                format_text_plain_func(block)
+            )
+            format_image_func = lambda block: format_centered_by_html(
+                format_image_scaled_by_html_func(
+                    block,
+                    original_image_width=original_image_width,
+                )
+            )
+
+            if self["model_settings"].get("use_chart_recognition", False):
+                format_chart_func = format_chart2table_func
+            else:
+                format_chart_func = format_image_func
+
+            if self["model_settings"].get("use_seal_recognition", False):
+                format_seal_func = lambda block: "\n".join(
+                    [format_image_func(block), format_text_func(block)]
+                )
+            else:
+                format_seal_func = format_image_func
+
+            if self["model_settings"].get("use_table_recognition", False):
+                format_table_func = lambda block: "\n" + format_text_func(
+                    block
+                ).replace("<table>", '<table border="1">')
+            else:
+                format_table_func = format_image_func
+
+            if self["model_settings"].get("use_formula_recognition", False):
+                format_formula_func = lambda block: f"$${block.content}$$"
+            else:
+                format_formula_func = format_image_func
+
+            handle_funcs_dict = {
+                "paragraph_title": format_title_func,
+                "abstract_title": format_title_func,
+                "reference_title": format_title_func,
+                "content_title": format_title_func,
+                "doc_title": lambda block: f"# {block.content}".replace(
+                    "-\n",
+                    "",
+                ).replace("\n", " "),
+                "table_title": format_text_func,
+                "figure_title": format_text_func,
+                "chart_title": format_text_func,
+                "vision_footnote": lambda block: block.content.replace(
+                    "\n\n", "\n"
+                ).replace("\n", "\n\n"),
+                "text": lambda block: block.content.replace("\n\n", "\n").replace(
+                    "\n", "\n\n"
+                ),
+                "abstract": partial(
+                    format_first_line_func,
+                    templates=["摘要", "abstract"],
+                    format_func=lambda l: f"## {l}\n",
+                    spliter=" ",
+                ),
+                "content": lambda block: block.content.replace("-\n", "  \n").replace(
+                    "\n", "  \n"
+                ),
+                "image": format_image_func,
+                "chart": format_chart_func,
+                "formula": format_formula_func,
+                "table": format_table_func,
+                "reference": partial(
+                    format_first_line_func,
+                    templates=["参考文献", "references"],
+                    format_func=lambda l: f"## {l}",
+                    spliter="\n",
+                ),
+                "algorithm": lambda block: block.content.strip("\n"),
+                "seal": format_seal_func,
+            }
+
         data = {}
         data["input_path"] = self["input_path"]
         data["page_index"] = self["page_index"]
         model_settings = self["model_settings"]
         data["model_settings"] = model_settings
         parsing_res_list: List[LayoutBlock] = self["parsing_res_list"]
-        parsing_res_list = [
-            {
+        parsing_res_list_json = []
+        for parsing_res in parsing_res_list:
+            res_dict = {
                 "block_label": parsing_res.label,
                 "block_content": parsing_res.content,
                 "block_bbox": parsing_res.bbox,
                 "block_id": parsing_res.index,
                 "block_order": parsing_res.order_index,
             }
-            for parsing_res in parsing_res_list
-        ]
-        data["parsing_res_list"] = parsing_res_list
+            if self["model_settings"].get("format_block_content", False):
+                if handle_funcs_dict.get(parsing_res.label):
+                    res_dict["block_content"] = handle_funcs_dict[parsing_res.label](
+                        parsing_res
+                    )
+                else:
+                    res_dict["block_content"] = parsing_res.content
+
+            parsing_res_list_json.append(res_dict)
+        data["parsing_res_list"] = parsing_res_list_json
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["layout_det_res"] = self["layout_det_res"].json["res"]