Browse Source

ocr_pdf_intermediate_dict_to_markdown_with_para输出nlp格式的markdown

赵小蒙 cách đây 1 năm
mục cha
commit
07e4f115e6
1 tập tin đã thay đổi với 3 bổ sung và 2 xóa
  1. 3 2
      magic_pdf/pipeline.py

+ 3 - 2
magic_pdf/pipeline.py

@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
     ocr_mk_nlp_markdown,
     ocr_mk_mm_markdown,
     ocr_mk_mm_standard_format,
-    ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination,
+    ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_nlp_markdown_with_para,
 )
 from magic_pdf.libs.commons import (
     read_file,
@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
         pdf_intermediate_dict = jso["pdf_intermediate_dict"]
         # 将 pdf_intermediate_dict 解压
         pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
-        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
+        # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
+        markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
         jso["content"] = markdown_content
         logger.info(
             f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",