Browse Source

Rename pipeline files: drop the _v2 suffix from pdf_parse_by_ocr and pdf_parse_by_txt

赵小蒙 1 year ago
parent
commit
959b8d82d8

+ 1 - 0
magic_pdf/pdf_parse_by_ocr_v2.py → magic_pdf/pdf_parse_by_ocr.py

@@ -1,5 +1,6 @@
 from magic_pdf.pdf_parse_union_core import pdf_parse_union
 
+
 def parse_pdf_by_ocr(pdf_bytes,
                      model_list,
                      imageWriter,

+ 19 - 0
magic_pdf/pdf_parse_by_txt.py

@@ -0,0 +1,19 @@
+from magic_pdf.pdf_parse_union_core import pdf_parse_union
+
+
+def parse_pdf_by_txt(
+    pdf_bytes,
+    model_list,
+    imageWriter,
+    start_page_id=0,
+    end_page_id=None,
+    debug_mode=False,
+):
+    return pdf_parse_union(pdf_bytes,
+                           model_list,
+                           imageWriter,
+                           "txt",
+                           start_page_id=start_page_id,
+                           end_page_id=end_page_id,
+                           debug_mode=debug_mode,
+                           )

+ 0 - 56
magic_pdf/pdf_parse_by_txt_v2.py

@@ -1,56 +0,0 @@
-from magic_pdf.pdf_parse_union_core import pdf_parse_union
-
-
-def parse_pdf_by_txt(
-    pdf_bytes,
-    model_list,
-    imageWriter,
-    start_page_id=0,
-    end_page_id=None,
-    debug_mode=False,
-):
-    return pdf_parse_union(pdf_bytes,
-                           model_list,
-                           imageWriter,
-                           "txt",
-                           start_page_id=start_page_id,
-                           end_page_id=end_page_id,
-                           debug_mode=debug_mode,
-                           )
-
-
-if __name__ == "__main__":
-    pass
-    # if 1:
-    #     import fitz
-    #     import json
-    #
-    #     with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
-    #         pdf_bytes = f.read()
-    #     pdf_docs = fitz.open("pdf", pdf_bytes)
-    #
-    #     with open("/opt/data/pdf/20240418/25536-00.json") as f:
-    #         model_list = json.loads(f.readline())
-    #
-    #     magic_model = MagicModel(model_list, pdf_docs)
-    #     for i in range(7):
-    #         print(magic_model.get_imgs(i))
-    #
-    #     for page_no, page in enumerate(pdf_docs):
-    #         inline_equations, interline_equations, interline_equation_blocks = (
-    #             magic_model.get_equations(page_no)
-    #         )
-    #
-    #         text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
-    #         char_level_text_blocks = page.get_text(
-    #             "rawdict", flags=fitz.TEXTFLAGS_TEXT
-    #         )["blocks"]
-    #         text_blocks = combine_chars_to_pymudict(
-    #             text_raw_blocks, char_level_text_blocks
-    #         )
-    #         text_blocks = replace_equations_in_textblock(
-    #             text_blocks, inline_equations, interline_equations
-    #         )
-    #         text_blocks = remove_citation_marker(text_blocks)
-    #
-    #         text_blocks = remove_chars_in_text_blocks(text_blocks)

+ 3 - 41
magic_pdf/user_api.py

@@ -18,8 +18,8 @@ from loguru import logger
 from magic_pdf.libs.version import __version__
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw import AbsReaderWriter
-from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
-from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
 
 PARSE_TYPE_TXT = "txt"
 PARSE_TYPE_OCR = "ocr"
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
             return None
 
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-    # text_all = ""
-    # for page_dict in pdf_info_dict['pdf_info']:
-    #     for para_block in page_dict['para_blocks']:
-    #         if para_block['type'] in ['title', 'text']:
-    #             for line in para_block['lines']:
-    #                 for span in line['spans']:
-    #                     text_all += span['content']
-
-    # def calculate_not_common_character_rate(text):
-    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
-    #     # 计算乱码字符的数量
-    #     garbage_count = len(garbage_regex.findall(text))
-    #     total = len(text)
-    #     if total == 0:
-    #         return 0  # 避免除以零的错误
-    #     return garbage_count / total
-    #
-    # def calculate_not_printable_rate(text):
-    #     printable_text = ""
-    #     for c in text:
-    #         if c.isprintable():
-    #             printable_text += c
-    #     printable_total = len(printable_text)
-    #     total = len(text)
-    #     if total == 0:
-    #         return 0  # 避免除以零的错误
-    #     return (total - printable_total) / total
-    #
-    # not_common_character_rate = calculate_not_common_character_rate(text_all)
-    # not_printable_rate = calculate_not_printable_rate(text_all)
-    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
-    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
-    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
-    '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
-    # not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
-    if (pdf_info_dict is None
-            or pdf_info_dict.get("_need_drop", False)
-            # or not_printable_rate > 0.02  # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
-    ):
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
             pdf_models = doc_analyze(pdf_bytes, ocr=True)