1 år sedan · 959b8d82d8
--- a/magic_pdf/pdf_parse_by_ocr_v2.py
+++ b/magic_pdf/pdf_parse_by_ocr_v2.py
@@ -1,5 +1,6 @@
 
															 from magic_pdf.pdf_parse_union_core import pdf_parse_union
														
 
															+
														
 
															 def parse_pdf_by_ocr(pdf_bytes,
														
 
															                      model_list,
														
 
															                      imageWriter,
														
--- a/magic_pdf/pdf_parse_by_txt.py
+++ b/magic_pdf/pdf_parse_by_txt.py
@@ -0,0 +1,19 @@
 
															+from magic_pdf.pdf_parse_union_core import pdf_parse_union
														
 
															+
														
 
															+
														
 
															+def parse_pdf_by_txt(
														
 
															+    pdf_bytes,
														
 
															+    model_list,
														
 
															+    imageWriter,
														
 
															+    start_page_id=0,
														
 
															+    end_page_id=None,
														
 
															+    debug_mode=False,
														
 
															+):
														
 
															+    return pdf_parse_union(pdf_bytes,
														
 
															+                           model_list,
														
 
															+                           imageWriter,
														
 
															+                           "txt",
														
 
															+                           start_page_id=start_page_id,
														
 
															+                           end_page_id=end_page_id,
														
 
															+                           debug_mode=debug_mode,
														
 
															+                           )
														
--- a/magic_pdf/pdf_parse_by_txt_v2.py
+++ b/magic_pdf/pdf_parse_by_txt_v2.py
@@ -1,56 +0,0 @@
 
															-from magic_pdf.pdf_parse_union_core import pdf_parse_union
														
 
															-
														
 
															-
														
 
															-def parse_pdf_by_txt(
														
 
															-    pdf_bytes,
														
 
															-    model_list,
														
 
															-    imageWriter,
														
 
															-    start_page_id=0,
														
 
															-    end_page_id=None,
														
 
															-    debug_mode=False,
														
 
															-):
														
 
															-    return pdf_parse_union(pdf_bytes,
														
 
															-                           model_list,
														
 
															-                           imageWriter,
														
 
															-                           "txt",
														
 
															-                           start_page_id=start_page_id,
														
 
															-                           end_page_id=end_page_id,
														
 
															-                           debug_mode=debug_mode,
														
 
															-                           )
														
 
															-
														
 
															-
														
 
															-if __name__ == "__main__":
														
 
															-    pass
														
 
															-    # if 1:
														
 
															-    #     import fitz
														
 
															-    #     import json
														
 
															-    #
														
 
															-    #     with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
														
 
															-    #         pdf_bytes = f.read()
														
 
															-    #     pdf_docs = fitz.open("pdf", pdf_bytes)
														
 
															-    #
														
 
															-    #     with open("/opt/data/pdf/20240418/25536-00.json") as f:
														
 
															-    #         model_list = json.loads(f.readline())
														
 
															-    #
														
 
															-    #     magic_model = MagicModel(model_list, pdf_docs)
														
 
															-    #     for i in range(7):
														
 
															-    #         print(magic_model.get_imgs(i))
														
 
															-    #
														
 
															-    #     for page_no, page in enumerate(pdf_docs):
														
 
															-    #         inline_equations, interline_equations, interline_equation_blocks = (
														
 
															-    #             magic_model.get_equations(page_no)
														
 
															-    #         )
														
 
															-    #
														
 
															-    #         text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
														
 
															-    #         char_level_text_blocks = page.get_text(
														
 
															-    #             "rawdict", flags=fitz.TEXTFLAGS_TEXT
														
 
															-    #         )["blocks"]
														
 
															-    #         text_blocks = combine_chars_to_pymudict(
														
 
															-    #             text_raw_blocks, char_level_text_blocks
														
 
															-    #         )
														
 
															-    #         text_blocks = replace_equations_in_textblock(
														
 
															-    #             text_blocks, inline_equations, interline_equations
														
 
															-    #         )
														
 
															-    #         text_blocks = remove_citation_marker(text_blocks)
														
 
															-    #
														
 
															-    #         text_blocks = remove_chars_in_text_blocks(text_blocks)
														
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -18,8 +18,8 @@ from loguru import logger
 
															 from magic_pdf.libs.version import __version__
														
 
															 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
														
 
															 from magic_pdf.rw import AbsReaderWriter
														
 
															-from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
														
 
															-from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
														
 
															+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
														
 
															+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
														
 
															 PARSE_TYPE_TXT = "txt"
														
 
															 PARSE_TYPE_OCR = "ocr"
														
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
 
															             return None
														
 
															     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
														
 
															-    # text_all = ""
														
 
															-    # for page_dict in pdf_info_dict['pdf_info']:
														
 
															-    #     for para_block in page_dict['para_blocks']:
														
 
															-    #         if para_block['type'] in ['title', 'text']:
														
 
															-    #             for line in para_block['lines']:
														
 
															-    #                 for span in line['spans']:
														
 
															-    #                     text_all += span['content']
														
 
															-
														
 
															-    # def calculate_not_common_character_rate(text):
														
 
															-    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
														
 
															-    #     # 计算乱码字符的数量
														
 
															-    #     garbage_count = len(garbage_regex.findall(text))
														
 
															-    #     total = len(text)
														
 
															-    #     if total == 0:
														
 
															-    #         return 0  # 避免除以零的错误
														
 
															-    #     return garbage_count / total
														
 
															-    #
														
 
															-    # def calculate_not_printable_rate(text):
														
 
															-    #     printable_text = ""
														
 
															-    #     for c in text:
														
 
															-    #         if c.isprintable():
														
 
															-    #             printable_text += c
														
 
															-    #     printable_total = len(printable_text)
														
 
															-    #     total = len(text)
														
 
															-    #     if total == 0:
														
 
															-    #         return 0  # 避免除以零的错误
														
 
															-    #     return (total - printable_total) / total
														
 
															-    #
														
 
															-    # not_common_character_rate = calculate_not_common_character_rate(text_all)
														
 
															-    # not_printable_rate = calculate_not_printable_rate(text_all)
														
 
															-    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
														
 
															-    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
														
 
															-    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
														
 
															-    '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
														
 
															-    # not_common_character_rate对小语种可能会有误伤，not_printable_rate对小语种较为友好
														
 
															-    if (pdf_info_dict is None
														
 
															-            or pdf_info_dict.get("_need_drop", False)
														
 
															-            # or not_printable_rate > 0.02  # 参考一些正常的pdf，这个值没有超过0.01的，阈值设为0.02
														
 
															-    ):
														
 
															+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
														
 
															         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
														
 
															         if input_model_is_empty:
														
 
															             pdf_models = doc_analyze(pdf_bytes, ocr=True)