9 months ago · 6f6a7c13b0
--- a/paddlex/inference/pipelines/doc_preprocessor/result.py
+++ b/paddlex/inference/pipelines/doc_preprocessor/result.py
@@ -79,6 +79,7 @@ class DocPreprocessorResult(BaseCVResult):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         data["angle"] = self["angle"]
														
 
															         return JsonMixin._to_str(data, *args, **kwargs)
														
@@ -96,6 +97,7 @@ class DocPreprocessorResult(BaseCVResult):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         data["angle"] = self["angle"]
														
 
															         return JsonMixin._to_json(data, *args, **kwargs)
														
--- a/paddlex/inference/pipelines/formula_recognition/result.py
+++ b/paddlex/inference/pipelines/formula_recognition/result.py
@@ -163,6 +163,7 @@ class FormulaRecognitionResult(BaseCVResult):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         if self["model_settings"]["use_doc_preprocessor"]:
														
 
															             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
														
@@ -195,6 +196,7 @@ class FormulaRecognitionResult(BaseCVResult):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = str(self["page_index"])
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         if self["model_settings"]["use_doc_preprocessor"]:
														
 
															             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
														
--- a/paddlex/inference/pipelines/layout_parsing/pipeline.py
+++ b/paddlex/inference/pipelines/layout_parsing/pipeline.py
@@ -12,12 +12,11 @@
 
															 # See the License for the specific language governing permissions and
														
 
															 # limitations under the License.
														
 
															-from email.mime import image
														
 
															-from typing import Any, Dict, Optional, Union, List, Tuple
														
 
															+from typing import Dict, Optional, Union, List, Tuple
														
 
															 import numpy as np
														
 
															 from ..base import BasePipeline
														
 
															 from .utils import get_sub_regions_ocr_res, sorted_layout_boxes
														
 
															-from ..components import convert_points_to_boxes
														
 
															+from ..components import CropByBoxes
														
 
															 from .result import LayoutParsingResult
														
 
															 from ....utils import logging
														
 
															 from ...utils.pp_option import PaddlePredictorOption
														
@@ -56,6 +55,7 @@ class LayoutParsingPipeline(BasePipeline):
 
															         self.batch_sampler = ImageBatchSampler(batch_size=1)
														
 
															         self.img_reader = ReadImage(format="BGR")
														
 
															+        self._crop_by_boxes = CropByBoxes()
														
 
															     def inintial_predictor(self, config: Dict) -> None:
														
 
															         """Initializes the predictor based on the provided configuration.
														
@@ -88,7 +88,6 @@ class LayoutParsingPipeline(BasePipeline):
 
															             "LayoutDetection",
														
 
															             {"model_config_error": "config error for layout_det_model!"},
														
 
															         )
														
 
															-        self.layout_det_model = self.create_model(layout_det_config)
														
 
															         layout_kwargs = {}
														
 
															         if (threshold := layout_det_config.get("threshold", None)) is not None:
														
 
															             layout_kwargs["threshold"] = threshold
														
@@ -205,7 +204,9 @@ class LayoutParsingPipeline(BasePipeline):
 
															             list: A list of dictionaries representing the layout parsing result.
														
 
															         """
														
 
															         layout_parsing_res = []
														
 
															+        sub_image_list = []
														
 
															         matched_ocr_dict = {}
														
 
															+        sub_image_region_id = 0
														
 
															         formula_index = 0
														
 
															         table_index = 0
														
 
															         seal_index = 0
														
@@ -218,15 +219,15 @@ class LayoutParsingPipeline(BasePipeline):
 
															             label = box_info["label"].lower()
														
 
															             single_box_res["layout_bbox"] = box
														
 
															             object_boxes.append(box)
														
 
															-            if label == "formula":
														
 
															+            if label == "formula" and len(formula_res_list) > formula_index:
														
 
															                 single_box_res["formula"] = formula_res_list[formula_index][
														
 
															                     "rec_formula"
														
 
															                 ]
														
 
															                 formula_index += 1
														
 
															-            elif label == "table":
														
 
															+            elif label == "table" and len(table_res_list) > table_index:
														
 
															                 single_box_res["table"] = table_res_list[table_index]["pred_html"]
														
 
															                 table_index += 1
														
 
															-            elif label == "seal":
														
 
															+            elif label == "seal" and len(seal_res_list) > seal_index:
														
 
															                 single_box_res["seal"] = "".join(seal_res_list[seal_index]["rec_texts"])
														
 
															                 seal_index += 1
														
 
															             else:
														
@@ -239,9 +240,9 @@ class LayoutParsingPipeline(BasePipeline):
 
															                     else:
														
 
															                         matched_ocr_dict[matched_idx].append(object_box_idx)
														
 
															                 if label in image_labels:
														
 
															-                    x1, y1, x2, y2 = [int(i) for i in box]
														
 
															-                    sub_image = image[y1:y2, x1:x2, :]
														
 
															-                    single_box_res["image"] = sub_image
														
 
															+                    crop_img_info = self._crop_by_boxes(image, [box_info])
														
 
															+                    crop_img_info = crop_img_info[0]
														
 
															+                    sub_image_list.append(crop_img_info["img"])
														
 
															                     single_box_res[f"{label}_text"] = "\n".join(
														
 
															                         ocr_res_in_box["rec_texts"]
														
 
															                     )
														
@@ -286,7 +287,7 @@ class LayoutParsingPipeline(BasePipeline):
 
															         layout_parsing_res = sorted_layout_boxes(layout_parsing_res, w=image.shape[1])
														
 
															-        return layout_parsing_res
														
 
															+        return layout_parsing_res, sub_image_list
														
 
															     def check_model_settings_valid(self, input_params: Dict) -> bool:
														
 
															         """
														
@@ -380,10 +381,15 @@ class LayoutParsingPipeline(BasePipeline):
 
															         input: Union[str, List[str], np.ndarray, List[np.ndarray]],
														
 
															         use_doc_orientation_classify: Optional[bool] = None,
														
 
															         use_doc_unwarping: Optional[bool] = None,
														
 
															+        use_textline_orientation: Optional[bool] = None,
														
 
															         use_general_ocr: Optional[bool] = None,
														
 
															         use_seal_recognition: Optional[bool] = None,
														
 
															         use_table_recognition: Optional[bool] = None,
														
 
															         use_formula_recognition: Optional[bool] = None,
														
 
															+        layout_threshold: Optional[Union[float, dict]] = None,
														
 
															+        layout_nms: Optional[bool] = None,
														
 
															+        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
														
 
															+        layout_merge_bboxes_mode: Optional[str] = None,
														
 
															         text_det_limit_side_len: Optional[int] = None,
														
 
															         text_det_limit_type: Optional[str] = None,
														
 
															         text_det_thresh: Optional[float] = None,
														
@@ -396,10 +402,6 @@ class LayoutParsingPipeline(BasePipeline):
 
															         seal_det_box_thresh: Optional[float] = None,
														
 
															         seal_det_unclip_ratio: Optional[float] = None,
														
 
															         seal_rec_score_thresh: Optional[float] = None,
														
 
															-        layout_threshold: Optional[Union[float, dict]] = None,
														
 
															-        layout_nms: Optional[bool] = None,
														
 
															-        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
														
 
															-        layout_merge_bboxes_mode: Optional[str] = None,
														
 
															         **kwargs,
														
 
															     ) -> LayoutParsingResult:
														
 
															         """
														
@@ -407,11 +409,34 @@ class LayoutParsingPipeline(BasePipeline):
 
															         Args:
														
 
															             input (Union[str, list[str], np.ndarray, list[np.ndarray]]): The input image(s) or pdf(s) to be processed.
														
 
															-            use_doc_orientation_classify (bool): Whether to use document orientation classification.
														
 
															-            use_doc_unwarping (bool): Whether to use document unwarping.
														
 
															-            use_general_ocr (bool): Whether to use general OCR.
														
 
															-            use_seal_recognition (bool): Whether to use seal recognition.
														
 
															-            use_table_recognition (bool): Whether to use table recognition.
														
 
															+            use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
														
 
															+            use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
														
 
															+            use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
														
 
															+            use_general_ocr (Optional[bool]): Whether to use general OCR.
														
 
															+            use_seal_recognition (Optional[bool]): Whether to use seal recognition.
														
 
															+            use_table_recognition (Optional[bool]): Whether to use table recognition.
														
 
															+            use_formula_recognition (Optional[bool]): Whether to use formula recognition.
														
 
															+            layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
														
 
															+            layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
														
 
															+            layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
														
 
															+                Defaults to None.
														
 
															+                If it's a single number, then both width and height are used.
														
 
															+                If it's a tuple of two numbers, then they are used separately for width and height respectively.
														
 
															+                If it's None, then no unclipping will be performed.
														
 
															+            layout_merge_bboxes_mode (Optional[str], optional): The mode for merging bounding boxes. Defaults to None.
														
 
															+            text_det_limit_side_len (Optional[int]): Maximum side length for text detection.
														
 
															+            text_det_limit_type (Optional[str]): Type of limit to apply for text detection.
														
 
															+            text_det_thresh (Optional[float]): Threshold for text detection.
														
 
															+            text_det_box_thresh (Optional[float]): Threshold for text detection boxes.
														
 
															+            text_det_unclip_ratio (Optional[float]): Ratio for unclipping text detection boxes.
														
 
															+            text_rec_score_thresh (Optional[float]): Score threshold for text recognition.
														
 
															+            seal_det_limit_side_len (Optional[int]): Maximum side length for seal detection.
														
 
															+            seal_det_limit_type (Optional[str]): Type of limit to apply for seal detection.
														
 
															+            seal_det_thresh (Optional[float]): Threshold for seal detection.
														
 
															+            seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
														
 
															+            seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
														
 
															+            seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
														
 
															+
														
 
															             **kwargs: Additional keyword arguments.
														
 
															         Returns:
														
@@ -463,6 +488,7 @@ class LayoutParsingPipeline(BasePipeline):
 
															                 overall_ocr_res = next(
														
 
															                     self.general_ocr_pipeline(
														
 
															                         doc_preprocessor_image,
														
 
															+                        use_textline_orientation=use_textline_orientation,
														
 
															                         text_det_limit_side_len=text_det_limit_side_len,
														
 
															                         text_det_limit_type=text_det_limit_type,
														
 
															                         text_det_thresh=text_det_thresh,
														
@@ -531,7 +557,7 @@ class LayoutParsingPipeline(BasePipeline):
 
															             else:
														
 
															                 formula_res_list = []
														
 
															-            parsing_res_list = self.get_layout_parsing_res(
														
 
															+            parsing_res_list, sub_image_list = self.get_layout_parsing_res(
														
 
															                 doc_preprocessor_image,
														
 
															                 layout_det_res=layout_det_res,
														
 
															                 overall_ocr_res=overall_ocr_res,
														
@@ -558,5 +584,6 @@ class LayoutParsingPipeline(BasePipeline):
 
															                 "formula_res_list": formula_res_list,
														
 
															                 "parsing_res_list": parsing_res_list,
														
 
															                 "model_settings": model_settings,
														
 
															+                "sub_image_list": sub_image_list,
														
 
															             }
														
 
															             yield LayoutParsingResult(single_img_res)
														
--- a/paddlex/inference/pipelines/layout_parsing/result.py
+++ b/paddlex/inference/pipelines/layout_parsing/result.py
@@ -12,12 +12,11 @@
 
															 # See the License for the specific language governing permissions and
														
 
															 # limitations under the License.
														
 
															-import os
														
 
															 from typing import Dict
														
 
															 import numpy as np
														
 
															 from PIL import Image, ImageDraw
														
 
															 import copy
														
 
															-from ...common.result import BaseCVResult, HtmlMixin, XlsxMixin, StrMixin, JsonMixin
														
 
															+from ...common.result import BaseCVResult, HtmlMixin, XlsxMixin, JsonMixin
														
 
															 class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
														
@@ -63,6 +62,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															                     table_draw.rectangle(
														
 
															                         [x1, y1, x2, y2], outline=rectangle_color, width=2
														
 
															                     )
														
 
															+            res_img_dict["table_cell_img"] = table_cell_img
														
 
															         if model_settings["use_seal_recognition"] and len(self["seal_res_list"]) > 0:
														
 
															             for sno in range(len(self["seal_res_list"])):
														
@@ -82,6 +82,16 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															                 sub_formula_res_dict = formula_res.img
														
 
															                 key = f"formula_res_region{formula_region_id}"
														
 
															                 res_img_dict[key] = sub_formula_res_dict["res"]
														
 
															+
														
 
															+        if len(self["sub_image_list"]) > 0:
														
 
															+            for sno in range(len(self["sub_image_list"])):
														
 
															+                sub_region_image = Image.fromarray(
														
 
															+                    copy.deepcopy(self["sub_image_list"][sno])
														
 
															+                )
														
 
															+                sub_region_image_id = sno + 1
														
 
															+                key = f"sub_region_image{sub_region_image_id}"
														
 
															+                res_img_dict[key] = sub_region_image
														
 
															+
														
 
															         return res_img_dict
														
 
															     def _to_str(self, *args, **kwargs) -> Dict[str, str]:
														
@@ -96,6 +106,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         model_settings = self["model_settings"]
														
 
															         data["model_settings"] = model_settings
														
 
															         data["parsing_res_list"] = self["parsing_res_list"]
														
@@ -147,6 +158,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         model_settings = self["model_settings"]
														
 
															         data["model_settings"] = model_settings
														
 
															         data["parsing_res_list"] = self["parsing_res_list"]
														
--- a/paddlex/inference/pipelines/layout_parsing/utils.py
+++ b/paddlex/inference/pipelines/layout_parsing/utils.py
@@ -108,6 +108,8 @@ def get_sub_regions_ocr_res(
 
															             sub_regions_ocr_res["rec_boxes"].append(
														
 
															                 overall_ocr_res["rec_boxes"][box_no]
														
 
															             )
														
 
															+    for key in ["rec_polys", "rec_scores", "rec_boxes"]:
														
 
															+        sub_regions_ocr_res[key] = np.array(sub_regions_ocr_res[key])
														
 
															     return (
														
 
															         (sub_regions_ocr_res, match_idx_list)
														
 
															         if return_match_idx
														
--- a/paddlex/inference/pipelines/seal_recognition/result.py
+++ b/paddlex/inference/pipelines/seal_recognition/result.py
@@ -74,6 +74,7 @@ class SealRecognitionResult(BaseCVResult):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         if self["model_settings"]["use_doc_preprocessor"]:
														
 
															             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
														
--- a/paddlex/inference/pipelines/table_recognition/result.py
+++ b/paddlex/inference/pipelines/table_recognition/result.py
@@ -138,6 +138,7 @@ class TableRecognitionResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         if self["model_settings"]["use_doc_preprocessor"]:
														
 
															             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
														
@@ -163,6 +164,7 @@ class TableRecognitionResult(BaseCVResult, HtmlMixin, XlsxMixin):
 
															         """
														
 
															         data = {}
														
 
															         data["input_path"] = self["input_path"]
														
 
															+        data["page_index"] = self["page_index"]
														
 
															         data["model_settings"] = self["model_settings"]
														
 
															         if self["model_settings"]["use_doc_preprocessor"]:
														
 
															             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]