1 year ago · 959b8d82d8
--- a/magic_pdf/pdf_parse_by_ocr_v2.py
+++ b/magic_pdf/pdf_parse_by_ocr_v2.py
@@ -1,5 +1,6 @@
 
				 from magic_pdf.pdf_parse_union_core import pdf_parse_union
			
 
				 
			
 
				+
			
 
				 def parse_pdf_by_ocr(pdf_bytes,
			
 
				                      model_list,
			
 
				                      imageWriter,
			
--- a/magic_pdf/pdf_parse_by_txt.py
+++ b/magic_pdf/pdf_parse_by_txt.py
@@ -0,0 +1,19 @@
 
				+from magic_pdf.pdf_parse_union_core import pdf_parse_union
			
 
				+
			
 
				+
			
 
				+def parse_pdf_by_txt(
			
 
				+    pdf_bytes,
			
 
				+    model_list,
			
 
				+    imageWriter,
			
 
				+    start_page_id=0,
			
 
				+    end_page_id=None,
			
 
				+    debug_mode=False,
			
 
				+):
			
 
				+    return pdf_parse_union(pdf_bytes,
			
 
				+                           model_list,
			
 
				+                           imageWriter,
			
 
				+                           "txt",
			
 
				+                           start_page_id=start_page_id,
			
 
				+                           end_page_id=end_page_id,
			
 
				+                           debug_mode=debug_mode,
			
 
				+                           )
			
--- a/magic_pdf/pdf_parse_by_txt_v2.py
+++ b/magic_pdf/pdf_parse_by_txt_v2.py
@@ -1,56 +0,0 @@
 
				-from magic_pdf.pdf_parse_union_core import pdf_parse_union
			
 
				-
			
 
				-
			
 
				-def parse_pdf_by_txt(
			
 
				-    pdf_bytes,
			
 
				-    model_list,
			
 
				-    imageWriter,
			
 
				-    start_page_id=0,
			
 
				-    end_page_id=None,
			
 
				-    debug_mode=False,
			
 
				-):
			
 
				-    return pdf_parse_union(pdf_bytes,
			
 
				-                           model_list,
			
 
				-                           imageWriter,
			
 
				-                           "txt",
			
 
				-                           start_page_id=start_page_id,
			
 
				-                           end_page_id=end_page_id,
			
 
				-                           debug_mode=debug_mode,
			
 
				-                           )
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    pass
			
 
				-    # if 1:
			
 
				-    #     import fitz
			
 
				-    #     import json
			
 
				-    #
			
 
				-    #     with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
			
 
				-    #         pdf_bytes = f.read()
			
 
				-    #     pdf_docs = fitz.open("pdf", pdf_bytes)
			
 
				-    #
			
 
				-    #     with open("/opt/data/pdf/20240418/25536-00.json") as f:
			
 
				-    #         model_list = json.loads(f.readline())
			
 
				-    #
			
 
				-    #     magic_model = MagicModel(model_list, pdf_docs)
			
 
				-    #     for i in range(7):
			
 
				-    #         print(magic_model.get_imgs(i))
			
 
				-    #
			
 
				-    #     for page_no, page in enumerate(pdf_docs):
			
 
				-    #         inline_equations, interline_equations, interline_equation_blocks = (
			
 
				-    #             magic_model.get_equations(page_no)
			
 
				-    #         )
			
 
				-    #
			
 
				-    #         text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
			
 
				-    #         char_level_text_blocks = page.get_text(
			
 
				-    #             "rawdict", flags=fitz.TEXTFLAGS_TEXT
			
 
				-    #         )["blocks"]
			
 
				-    #         text_blocks = combine_chars_to_pymudict(
			
 
				-    #             text_raw_blocks, char_level_text_blocks
			
 
				-    #         )
			
 
				-    #         text_blocks = replace_equations_in_textblock(
			
 
				-    #             text_blocks, inline_equations, interline_equations
			
 
				-    #         )
			
 
				-    #         text_blocks = remove_citation_marker(text_blocks)
			
 
				-    #
			
 
				-    #         text_blocks = remove_chars_in_text_blocks(text_blocks)
			
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -18,8 +18,8 @@ from loguru import logger
 
				 from magic_pdf.libs.version import __version__
			
 
				 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				 from magic_pdf.rw import AbsReaderWriter
			
 
				-from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
			
 
				-from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
			
 
				+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
			
 
				+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
			
 
				 
			
 
				 PARSE_TYPE_TXT = "txt"
			
 
				 PARSE_TYPE_OCR = "ocr"
			
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
 
				             return None
			
 
				 
			
 
				     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
			
 
				-    # text_all = ""
			
 
				-    # for page_dict in pdf_info_dict['pdf_info']:
			
 
				-    #     for para_block in page_dict['para_blocks']:
			
 
				-    #         if para_block['type'] in ['title', 'text']:
			
 
				-    #             for line in para_block['lines']:
			
 
				-    #                 for span in line['spans']:
			
 
				-    #                     text_all += span['content']
			
 
				-
			
 
				-    # def calculate_not_common_character_rate(text):
			
 
				-    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
			
 
				-    #     # 计算乱码字符的数量
			
 
				-    #     garbage_count = len(garbage_regex.findall(text))
			
 
				-    #     total = len(text)
			
 
				-    #     if total == 0:
			
 
				-    #         return 0  # 避免除以零的错误
			
 
				-    #     return garbage_count / total
			
 
				-    #
			
 
				-    # def calculate_not_printable_rate(text):
			
 
				-    #     printable_text = ""
			
 
				-    #     for c in text:
			
 
				-    #         if c.isprintable():
			
 
				-    #             printable_text += c
			
 
				-    #     printable_total = len(printable_text)
			
 
				-    #     total = len(text)
			
 
				-    #     if total == 0:
			
 
				-    #         return 0  # 避免除以零的错误
			
 
				-    #     return (total - printable_total) / total
			
 
				-    #
			
 
				-    # not_common_character_rate = calculate_not_common_character_rate(text_all)
			
 
				-    # not_printable_rate = calculate_not_printable_rate(text_all)
			
 
				-    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
			
 
				-    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
			
 
				-    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
			
 
				-    '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
			
 
				-    # not_common_character_rate对小语种可能会有误伤，not_printable_rate对小语种较为友好
			
 
				-    if (pdf_info_dict is None
			
 
				-            or pdf_info_dict.get("_need_drop", False)
			
 
				-            # or not_printable_rate > 0.02  # 参考一些正常的pdf，这个值没有超过0.01的，阈值设为0.02
			
 
				-    ):
			
 
				+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
			
 
				         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
			
 
				         if input_model_is_empty:
			
 
				             pdf_models = doc_analyze(pdf_bytes, ocr=True)