Procházet zdrojové kódy

为 OCR 模式的 demo 增加 online 模式,并对 pipeline 进行微调以适配 online 模式

赵小蒙 před 1 rokem
rodič
revize
ce96c3f67c
4 změněné soubory, 59 přidání a 52 odebrání
  1. 1 1
      demo/demo_test.py
  2. 48 39
      demo/ocr_demo.py
  3. 8 10
      magic_pdf/libs/draw_bbox.py
  4. 2 2
      magic_pdf/pdf_parse_by_ocr.py

+ 1 - 1
demo/demo_test.py

@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
         s3_config = get_s3_config(json_path)
         file_content = read_file(json_path, s3_config)
         json_str = file_content.decode("utf-8")
-        logger.info(json_str)
+        # logger.info(json_str)
         json_object = json.loads(json_str)
     return json_object
 

+ 48 - 39
demo/ocr_demo.py

@@ -4,6 +4,7 @@ import os
 from loguru import logger
 from pathlib import Path
 
+from app.common.s3 import get_s3_config
 from demo.demo_test import get_json_from_local_or_s3
 from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
 from magic_pdf.libs.commons import join_path
@@ -35,50 +36,58 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
         book_name = pth.name
-        save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
-        save_path = join_path(save_tmp_path, "md")
-        save_path_with_bookname = os.path.join(save_path, book_name)
-        text_content_save_path = f"{save_path_with_bookname}/book.md"
-        pdf_info_dict = parse_pdf_by_ocr(
-            ocr_pdf_path,
-            None,
-            ocr_pdf_model_info,
-            save_path,
-            book_name,
-            debug_mode=True)
-
-        parent_dir = os.path.dirname(text_content_save_path)
-        if not os.path.exists(parent_dir):
-            os.makedirs(parent_dir)
-
-        # markdown_content = mk_nlp_markdown(pdf_info_dict)
-        markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
-
-        with open(text_content_save_path, "w", encoding="utf-8") as f:
-            f.write(markdown_content)
-
-        standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
-        standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
-        with open(standard_format_save_path, "w", encoding="utf-8") as f:
-            f.write(str(standard_format))
-
-        # logger.info(markdown_content)
-        # save_markdown(markdown_text, ocr_json_file_path)
+        ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
     except Exception as e:
         logger.exception(e)
 
 
 def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
-    json_object = get_json_from_local_or_s3(book_name)
-    logger.info(json_object)
+    try:
+        json_object = get_json_from_local_or_s3(book_name)
+        # logger.info(json_object)
+        s3_pdf_path = json_object["file_location"]
+        s3_config = get_s3_config(s3_pdf_path)
+        ocr_pdf_model_info = json_object["doc_layout_result"]
+        ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
+    except Exception as e:
+        logger.exception(e)
+
+
+def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+    save_path = join_path(save_tmp_path, "md")
+    save_path_with_bookname = os.path.join(save_path, book_name)
+    text_content_save_path = f"{save_path_with_bookname}/book.md"
+    pdf_info_dict = parse_pdf_by_ocr(
+        ocr_pdf_path,
+        s3_config,
+        ocr_pdf_model_info,
+        save_path,
+        book_name,
+        debug_mode=True)
+
+    parent_dir = os.path.dirname(text_content_save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    # markdown_content = mk_nlp_markdown(pdf_info_dict)
+    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
+
+    with open(text_content_save_path, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+
+    standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
+    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
+    with open(standard_format_save_path, "w", encoding="utf-8") as f:
+        f.write(str(standard_format))
+
 
 if __name__ == '__main__':
-    #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
-    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
-    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
-    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
-    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
+    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
+    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
+    # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
+    # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
+    # ocr_local_parse(pdf_path, json_file_path)
     ocr_online_parse(book_name="数学新星网/edu_00001236")
-    ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
-    pass

+ 8 - 10
magic_pdf/libs/draw_bbox.py

@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
         page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb)  # Insert the index at the top left corner of the rectangle
 
 
-def draw_layout_bbox(pdf_info_dict, input_path, out_path):
+def draw_layout_bbox(pdf_info_dict, pdf_bytes, out_path):
     layout_bbox_list = []
     dropped_bbox_list = []
     for page in pdf_info_dict.values():
@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
             for dropped_bbox in dropped_bboxes:
                 page_dropped_list.append(dropped_bbox)
         dropped_bbox_list.append(page_dropped_list)
-
-    doc = fitz.open(input_path)
-    for i, page in enumerate(doc):
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    for i, page in enumerate(pdf_docs):
         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
         draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
     # Save the PDF
-    doc.save(f"{out_path}/layout.pdf")
+    pdf_docs.save(f"{out_path}/layout.pdf")
 
-def draw_text_bbox(pdf_info_dict, input_path, out_path):
+def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
     text_list = []
     inline_equation_list = []
     interline_equation_list = []
@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
         text_list.append(page_text_list)
         inline_equation_list.append(page_inline_equation_list)
         interline_equation_list.append(page_interline_equation_list)
-
-    doc = fitz.open(input_path)
-    for i, page in enumerate(doc):
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    for i, page in enumerate(pdf_docs):
         # 获取当前页面的数据
         draw_bbox_without_number(i, text_list, page, [255, 0, 0])
         draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
         draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
 
     # Save the PDF
-    doc.save(f"{out_path}/text.pdf")
+    pdf_docs.save(f"{out_path}/text.pdf")

+ 2 - 2
magic_pdf/pdf_parse_by_ocr.py

@@ -282,7 +282,7 @@ def parse_pdf_by_ocr(
             json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
 
         # drow_bbox
-        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
-        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
+        draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
+        draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
 
     return pdf_info_dict