Browse Source

pdf_parse_by_model.py ---> pdf_parse_by_txt.py

赵小蒙 1 năm trước
mục cha
commit
f65be6e094
3 tập tin đã thay đổi với 5 bổ sung và 5 xóa
  1. 2 2
      demo/pdf2md.py
  2. 1 1
      magic_pdf/pdf_parse_by_txt.py
  3. 2 2
      magic_pdf/pipeline.py

+ 2 - 2
demo/pdf2md.py

@@ -8,7 +8,7 @@ from loguru import logger
 
 from magic_pdf.libs.commons import join_path, read_file
 from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
-from magic_pdf.pipeline import parse_pdf_by_model
+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
 
 
 
@@ -25,7 +25,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
     pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
 
     try:
-        paras_dict = parse_pdf_by_model(
+        paras_dict = parse_pdf_by_txt(
             pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
         )
         parent_dir = os.path.dirname(text_content_save_path)

+ 1 - 1
magic_pdf/pdf_parse_by_model.py → magic_pdf/pdf_parse_by_txt.py

@@ -70,7 +70,7 @@ paraMergeException_msg = ParaMergeException().message
 
 
 
-def parse_pdf_by_model(
+def parse_pdf_by_txt(
     pdf_bytes,
     pdf_model_output,
     save_path,

+ 2 - 2
magic_pdf/pipeline.py

@@ -13,7 +13,7 @@ from magic_pdf.libs.commons import (
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.dict2md.mkcontent import mk_universal_format
-from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
 from loguru import logger
@@ -310,7 +310,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
                 f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
                 file=sys.stderr,
             )
-            pdf_info_dict = parse_pdf_by_model(
+            pdf_info_dict = parse_pdf_by_txt(
                 pdf_bytes,
                 model_output_json_list,
                 save_path,