Sfoglia il codice sorgente

Remove use_general_ocr (general OCR now always runs in the layout parsing pipelines)

gaotingquan 6 mesi fa
parent
commit
670baf15b2

+ 0 - 1
paddlex/configs/pipelines/PP-StructureV3.yaml

@@ -4,7 +4,6 @@ pipeline_name: PP-StructureV3
 batch_size: 8
 
 use_doc_preprocessor: True
-use_general_ocr: True
 use_seal_recognition: True
 use_table_recognition: True
 use_formula_recognition: True

+ 0 - 1
paddlex/configs/pipelines/layout_parsing.yaml

@@ -2,7 +2,6 @@
 pipeline_name: layout_parsing
 
 use_doc_preprocessor: True
-use_general_ocr: True
 use_seal_recognition: True
 use_table_recognition: True
 use_formula_recognition: False

+ 16 - 39
paddlex/inference/pipelines/layout_parsing/pipeline.py

@@ -77,7 +77,6 @@ class _LayoutParsingPipeline(BasePipeline):
         """
 
         self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
-        self.use_general_ocr = config.get("use_general_ocr", True)
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
         self.use_formula_recognition = config.get("use_formula_recognition", True)
@@ -114,12 +113,11 @@ class _LayoutParsingPipeline(BasePipeline):
             layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
         self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
 
-        if self.use_general_ocr or self.use_table_recognition:
-            general_ocr_config = config.get("SubPipelines", {}).get(
-                "GeneralOCR",
-                {"pipeline_config_error": "config error for general_ocr_pipeline!"},
-            )
-            self.general_ocr_pipeline = self.create_pipeline(general_ocr_config)
+        general_ocr_config = config.get("SubPipelines", {}).get(
+            "GeneralOCR",
+            {"pipeline_config_error": "config error for general_ocr_pipeline!"},
+        )
+        self.general_ocr_pipeline = self.create_pipeline(general_ocr_config)
 
         if self.use_seal_recognition:
             seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -305,12 +303,6 @@ class _LayoutParsingPipeline(BasePipeline):
             )
             return False
 
-        if input_params["use_general_ocr"] and not self.use_general_ocr:
-            logging.error(
-                "Set use_general_ocr, but the models for general OCR are not initialized."
-            )
-            return False
-
         if input_params["use_seal_recognition"] and not self.use_seal_recognition:
             logging.error(
                 "Set use_seal_recognition, but the models for seal recognition are not initialized."
@@ -329,7 +321,6 @@ class _LayoutParsingPipeline(BasePipeline):
         self,
         use_doc_orientation_classify: Optional[bool],
         use_doc_unwarping: Optional[bool],
-        use_general_ocr: Optional[bool],
         use_seal_recognition: Optional[bool],
         use_table_recognition: Optional[bool],
         use_formula_recognition: Optional[bool],
@@ -340,7 +331,6 @@ class _LayoutParsingPipeline(BasePipeline):
         Args:
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
 
@@ -355,9 +345,6 @@ class _LayoutParsingPipeline(BasePipeline):
             else:
                 use_doc_preprocessor = False
 
-        if use_general_ocr is None:
-            use_general_ocr = self.use_general_ocr
-
         if use_seal_recognition is None:
             use_seal_recognition = self.use_seal_recognition
 
@@ -369,7 +356,6 @@ class _LayoutParsingPipeline(BasePipeline):
 
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             use_formula_recognition=use_formula_recognition,
@@ -381,7 +367,6 @@ class _LayoutParsingPipeline(BasePipeline):
         use_doc_orientation_classify: Optional[bool] = None,
         use_doc_unwarping: Optional[bool] = None,
         use_textline_orientation: Optional[bool] = None,
-        use_general_ocr: Optional[bool] = None,
         use_seal_recognition: Optional[bool] = None,
         use_table_recognition: Optional[bool] = None,
         use_formula_recognition: Optional[bool] = None,
@@ -411,7 +396,6 @@ class _LayoutParsingPipeline(BasePipeline):
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
@@ -445,7 +429,6 @@ class _LayoutParsingPipeline(BasePipeline):
         model_settings = self.get_model_settings(
             use_doc_orientation_classify,
             use_doc_unwarping,
-            use_general_ocr,
             use_seal_recognition,
             use_table_recognition,
             use_formula_recognition,
@@ -480,24 +463,18 @@ class _LayoutParsingPipeline(BasePipeline):
                 )
             )
 
-            if (
-                model_settings["use_general_ocr"]
-                or model_settings["use_table_recognition"]
-            ):
-                overall_ocr_res = next(
-                    self.general_ocr_pipeline(
-                        doc_preprocessor_image,
-                        use_textline_orientation=use_textline_orientation,
-                        text_det_limit_side_len=text_det_limit_side_len,
-                        text_det_limit_type=text_det_limit_type,
-                        text_det_thresh=text_det_thresh,
-                        text_det_box_thresh=text_det_box_thresh,
-                        text_det_unclip_ratio=text_det_unclip_ratio,
-                        text_rec_score_thresh=text_rec_score_thresh,
-                    )
+            overall_ocr_res = next(
+                self.general_ocr_pipeline(
+                    doc_preprocessor_image,
+                    use_textline_orientation=use_textline_orientation,
+                    text_det_limit_side_len=text_det_limit_side_len,
+                    text_det_limit_type=text_det_limit_type,
+                    text_det_thresh=text_det_thresh,
+                    text_det_box_thresh=text_det_box_thresh,
+                    text_det_unclip_ratio=text_det_unclip_ratio,
+                    text_rec_score_thresh=text_rec_score_thresh,
                 )
-            else:
-                overall_ocr_res = {}
+            )
 
             if model_settings["use_table_recognition"]:
                 table_res_all = next(

+ 35 - 55
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -96,7 +96,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         """
 
         self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
-        self.use_general_ocr = config.get("use_general_ocr", True)
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
         self.use_region_detection = config.get(
@@ -154,14 +153,13 @@ class _LayoutParsingPipelineV2(BasePipeline):
             layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
         self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
 
-        if self.use_general_ocr or self.use_table_recognition:
-            general_ocr_config = config.get("SubPipelines", {}).get(
-                "GeneralOCR",
-                {"pipeline_config_error": "config error for general_ocr_pipeline!"},
-            )
-            self.general_ocr_pipeline = self.create_pipeline(
-                general_ocr_config,
-            )
+        general_ocr_config = config.get("SubPipelines", {}).get(
+            "GeneralOCR",
+            {"pipeline_config_error": "config error for general_ocr_pipeline!"},
+        )
+        self.general_ocr_pipeline = self.create_pipeline(
+            general_ocr_config,
+        )
 
         if self.use_seal_recognition:
             seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -251,12 +249,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
             )
             return False
 
-        if input_params["use_general_ocr"] and not self.use_general_ocr:
-            logging.error(
-                "Set use_general_ocr, but the models for general OCR are not initialized.",
-            )
-            return False
-
         if input_params["use_seal_recognition"] and not self.use_seal_recognition:
             logging.error(
                 "Set use_seal_recognition, but the models for seal recognition are not initialized.",
@@ -897,7 +889,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         self,
         use_doc_orientation_classify: Union[bool, None],
         use_doc_unwarping: Union[bool, None],
-        use_general_ocr: Union[bool, None],
         use_seal_recognition: Union[bool, None],
         use_table_recognition: Union[bool, None],
         use_formula_recognition: Union[bool, None],
@@ -910,7 +901,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         Args:
             use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
             use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
-            use_general_ocr (Union[bool, None]): Enables general OCR if True. Defaults to system setting if None.
             use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
             use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
             use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
@@ -927,9 +917,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
             else:
                 use_doc_preprocessor = False
 
-        if use_general_ocr is None:
-            use_general_ocr = self.use_general_ocr
-
         if use_seal_recognition is None:
             use_seal_recognition = self.use_seal_recognition
 
@@ -947,7 +934,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
 
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             use_formula_recognition=use_formula_recognition,
@@ -961,7 +947,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         use_doc_orientation_classify: Union[bool, None] = None,
         use_doc_unwarping: Union[bool, None] = None,
         use_textline_orientation: Optional[bool] = None,
-        use_general_ocr: Union[bool, None] = None,
         use_seal_recognition: Union[bool, None] = None,
         use_table_recognition: Union[bool, None] = None,
         use_formula_recognition: Union[bool, None] = None,
@@ -986,8 +971,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         use_table_cells_ocr_results: bool = False,
         use_e2e_wired_table_rec_model: bool = False,
         use_e2e_wireless_table_rec_model: bool = True,
-        max_new_tokens: int = 1024,
-        no_repeat_ngram_size: int = 20,
         **kwargs,
     ) -> LayoutParsingResultV2:
         """
@@ -997,7 +980,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
@@ -1025,8 +1007,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
             use_table_cells_ocr_results (bool): whether to use OCR results with cells.
             use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
             use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
-            max_new_tokens (int): argument for chart to table model, default by 1024.
-            no_repeat_ngram_size (int): argument for chart to table model, default by 20.
             **kwargs (Any): Additional settings to extend functionality.
 
         Returns:
@@ -1036,7 +1016,6 @@ class _LayoutParsingPipelineV2(BasePipeline):
         model_settings = self.get_model_settings(
             use_doc_orientation_classify,
             use_doc_unwarping,
-            use_general_ocr,
             use_seal_recognition,
             use_table_recognition,
             use_formula_recognition,
@@ -1113,33 +1092,18 @@ class _LayoutParsingPipelineV2(BasePipeline):
                     x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
                     doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
 
-            if (
-                model_settings["use_general_ocr"]
-                or model_settings["use_table_recognition"]
-            ):
-                overall_ocr_results = list(
-                    self.general_ocr_pipeline(
-                        doc_preprocessor_images,
-                        use_textline_orientation=use_textline_orientation,
-                        text_det_limit_side_len=text_det_limit_side_len,
-                        text_det_limit_type=text_det_limit_type,
-                        text_det_thresh=text_det_thresh,
-                        text_det_box_thresh=text_det_box_thresh,
-                        text_det_unclip_ratio=text_det_unclip_ratio,
-                        text_rec_score_thresh=text_rec_score_thresh,
-                    ),
-                )
-            else:
-                overall_ocr_results = [
-                    {
-                        "dt_polys": [],
-                        "rec_texts": [],
-                        "rec_scores": [],
-                        "rec_polys": [],
-                        "rec_boxes": np.array([]),
-                    }
-                    for _ in doc_preprocessor_images
-                ]
+            overall_ocr_results = list(
+                self.general_ocr_pipeline(
+                    doc_preprocessor_images,
+                    use_textline_orientation=use_textline_orientation,
+                    text_det_limit_side_len=text_det_limit_side_len,
+                    text_det_limit_type=text_det_limit_type,
+                    text_det_thresh=text_det_thresh,
+                    text_det_box_thresh=text_det_box_thresh,
+                    text_det_unclip_ratio=text_det_unclip_ratio,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                ),
+            )
 
             for overall_ocr_res in overall_ocr_results:
                 overall_ocr_res["rec_labels"] = ["text"] * len(
@@ -1245,6 +1209,22 @@ class _LayoutParsingPipelineV2(BasePipeline):
             else:
                 seal_res_lists = [[] for _ in doc_preprocessor_images]
 
+            chart_res_list = []
+            if model_settings["use_chart_recognition"]:
+                chart_imgs_list = []
+                for bbox in layout_det_res["boxes"]:
+                    if bbox["label"] == "chart":
+                        x_min, y_min, x_max, y_max = bbox["coordinate"]
+                        chart_img = doc_preprocessor_image[
+                            int(y_min) : int(y_max), int(x_min) : int(x_max), :
+                        ]
+                        chart_imgs_list.append({"image": chart_img})
+
+                for chart_res_batch in self.chart_recognition_model(
+                    input=chart_imgs_list,
+                ):
+                    chart_res_list.append(chart_res_batch["result"])
+
             for (
                 input_path,
                 page_index,

+ 3 - 6
paddlex/inference/pipelines/layout_parsing/result.py

@@ -47,8 +47,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
             res_img_dict.update(**self["doc_preprocessor_res"].img)
         res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"]
 
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
+        res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
 
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             table_cell_img = Image.fromarray(
@@ -106,8 +105,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
         data["layout_det_res"] = self["layout_det_res"].str["res"]
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
@@ -149,8 +147,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["layout_det_res"] = self["layout_det_res"].json["res"]
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):

+ 3 - 6
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -226,8 +226,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         if model_settings["use_region_detection"]:
             res_img_dict["region_det_res"] = self["region_det_res"].img["res"]
 
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
+        res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
 
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             table_cell_img = Image.fromarray(
@@ -296,8 +295,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
         data["layout_det_res"] = self["layout_det_res"].str["res"]
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
@@ -348,8 +346,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["layout_det_res"] = self["layout_det_res"].json["res"]
-        if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
-            data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):

+ 0 - 3
paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py

@@ -206,7 +206,6 @@ class PP_ChatOCRv3_Pipeline(PP_ChatOCR_Pipeline):
         input: Union[str, List[str], np.ndarray, List[np.ndarray]],
         use_doc_orientation_classify: Optional[bool] = None,
         use_doc_unwarping: Optional[bool] = None,
-        use_general_ocr: Optional[bool] = None,
         use_seal_recognition: Optional[bool] = None,
         use_table_recognition: Optional[bool] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
@@ -237,7 +236,6 @@ class PP_ChatOCRv3_Pipeline(PP_ChatOCR_Pipeline):
                                                                         numpy array of an image, or list of numpy arrays.
             use_doc_orientation_classify (bool): Flag to use document orientation classification.
             use_doc_unwarping (bool): Flag to use document unwarping.
-            use_general_ocr (bool): Flag to use general OCR.
             use_seal_recognition (bool): Flag to use seal recognition.
             use_table_recognition (bool): Flag to use table recognition.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
@@ -280,7 +278,6 @@ class PP_ChatOCRv3_Pipeline(PP_ChatOCR_Pipeline):
             input,
             use_doc_orientation_classify=use_doc_orientation_classify,
             use_doc_unwarping=use_doc_unwarping,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             layout_threshold=layout_threshold,

+ 0 - 3
paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py

@@ -249,7 +249,6 @@ class PP_ChatOCRv4_Pipeline(PP_ChatOCR_Pipeline):
         input: Union[str, List[str], np.ndarray, List[np.ndarray]],
         use_doc_orientation_classify: Optional[bool] = None,
         use_doc_unwarping: Optional[bool] = None,
-        use_general_ocr: Optional[bool] = None,
         use_seal_recognition: Optional[bool] = None,
         use_table_recognition: Optional[bool] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
@@ -280,7 +279,6 @@ class PP_ChatOCRv4_Pipeline(PP_ChatOCR_Pipeline):
                                                                         numpy array of an image, or list of numpy arrays.
             use_doc_orientation_classify (bool): Flag to use document orientation classification.
             use_doc_unwarping (bool): Flag to use document unwarping.
-            use_general_ocr (bool): Flag to use general OCR.
             use_seal_recognition (bool): Flag to use seal recognition.
             use_table_recognition (bool): Flag to use table recognition.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
@@ -322,7 +320,6 @@ class PP_ChatOCRv4_Pipeline(PP_ChatOCR_Pipeline):
             input,
             use_doc_orientation_classify=use_doc_orientation_classify,
             use_doc_unwarping=use_doc_unwarping,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             layout_threshold=layout_threshold,