Browse Source

ocr_demo main函数精简

赵小蒙 1 year ago
parent
commit
2277e31ff4
1 changed file with 15 additions and 13 deletions
  1. demo/ocr_demo.py (+15 -13)

+ 15 - 13
demo/ocr_demo.py

@@ -6,7 +6,13 @@ from pathlib import Path
 
 from app.common.s3 import get_s3_config
 from demo.demo_test import get_json_from_local_or_s3
-from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
+from magic_pdf.dict2md.ocr_mkcontent import (
+    ocr_mk_mm_markdown_with_para,
+    ocr_mk_nlp_markdown,
+    ocr_mk_mm_markdown,
+    ocr_mk_mm_standard_format,
+    ocr_mk_mm_markdown_with_para_and_pagination
+)
 from magic_pdf.libs.commons import join_path
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
@@ -47,7 +53,7 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
         # logger.info(json_object)
         s3_pdf_path = json_object["file_location"]
         s3_config = get_s3_config(s3_pdf_path)
-        ocr_pdf_model_info = json_object["doc_layout_result"]
+        ocr_pdf_model_info = json_object.get("doc_layout_result")
         ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
     except Exception as e:
         logger.exception(e)
@@ -72,6 +78,7 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
 
     # markdown_content = mk_nlp_markdown(pdf_info_dict)
     markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
+    # markdown_pagination = ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict)
 
     with open(text_content_save_path, "w", encoding="utf-8") as f:
         f.write(markdown_content)
@@ -83,14 +90,9 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
 
 
 if __name__ == '__main__':
-    #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
-    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
-    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
-    
-    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
-    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
-    # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
-    # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
-    ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
-    #ocr_online_parse(book_name="美国加州中学教材/edu_00000060")
+    # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
+    # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
+    # ocr_local_parse(pdf_path, json_file_path)
+    # book_name = "数学新星网/edu_00001236"
+    # ocr_online_parse(book_name)
+    pass