Kaynağa Gözat

Adapt the paddle analyze mode to the CLI input mode to improve analysis speed

赵小蒙 1 yıl önce
ebeveyn
işleme
0606301412

+ 7 - 2
magic_pdf/cli/magicpdf.py

@@ -87,6 +87,11 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
         sys.exit(1)
 
     pipe.pipe_classify()
+
+    '''如果没有传入有效的模型数据,则使用内置paddle解析'''
+    if len(model_list) == 0:
+        pipe.pipe_analyze()
+
     pipe.pipe_parse()
     pdf_info = pipe.pdf_mid_data['pdf_info']
     draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
@@ -255,8 +260,8 @@ def pdf_command(pdf, model, method):
             model_path = pdf.replace(".pdf", ".json")
             if not os.path.exists(model_path):
                 logger.warning(f"not found json {model_path} existed, use paddle analyze")
-                # 本地无模型数据则调用内置paddle分析
-                model_json = json_parse.dumps(doc_analyze(pdf_data, ocr=False, show_log=True))
+                # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
+                model_json = "[]"
             else:
                 model_json = read_fn(model_path).decode("utf-8")
         else:

+ 7 - 0
magic_pdf/pipe/AbsPipe.py

@@ -34,6 +34,13 @@ class AbsPipe(ABC):
         raise NotImplementedError
 
     @abstractmethod
+    def pipe_analyze(self):
+        """
+        有状态的跑模型分析
+        """
+        raise NotImplementedError
+
+    @abstractmethod
     def pipe_parse(self):
         """
         有状态的解析

+ 4 - 1
magic_pdf/pipe/OCRPipe.py

@@ -1,6 +1,6 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf
 
@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
     def pipe_classify(self):
         pass
 
+    def pipe_analyze(self):
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 

+ 4 - 0
magic_pdf/pipe/TXTPipe.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
     def pipe_classify(self):
         pass
 
+    def pipe_analyze(self):
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 

+ 12 - 1
magic_pdf/pipe/UNIPipe.py

@@ -3,6 +3,7 @@ import json
 from loguru import logger
 
 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.commons import join_path
@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
         self.pdf_type = jso_useful_key["_pdf_type"]
         super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
+        if len(self.model_list) == 0:
+            self.input_model_is_empty = True
+        else:
+            self.input_model_is_empty = False
 
     def pipe_classify(self):
         self.pdf_type = AbsPipe.classify(self.pdf_bytes)
 
+    def pipe_analyze(self):
+        if self.pdf_type == self.PIP_TXT:
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+        elif self.pdf_type == self.PIP_OCR:
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug)
+                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug)

+ 4 - 0
magic_pdf/user_api.py

@@ -16,6 +16,7 @@ import re
 from loguru import logger
 
 from magic_pdf.libs.version import __version__
+from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
 from magic_pdf.rw import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
@@ -65,6 +66,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
 
 
 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
+                    input_model_is_empty: bool = False,
                     *args, **kwargs):
     """
     ocr和文本混合的pdf,全部解析出来
@@ -119,6 +121,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
         or not_printable_rate > 0.02  # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
     ):
         logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+        if input_model_is_empty:
+            pdf_models = doc_analyze(pdf_bytes, ocr=True)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")