Przeglądaj źródła

debug时自动绘制layout区域和text区域

赵小蒙 1 rok temu
rodzic
commit
f31117deba
3 zmienionych plików z 83 dodań i 33 usunięć
  1. 62 17
      demo/draw_bbox.py
  2. 12 5
      demo/ocr_demo.py
  3. 9 11
      magic_pdf/pdf_parse_by_ocr.py

+ 62 - 17
demo/draw_bbox.py

@@ -1,21 +1,66 @@
 from magic_pdf.libs.commons import fitz  # PyMuPDF
 
-# PDF文件路径
-pdf_path = r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json.pdf"
-
-doc = fitz.open(pdf_path)  # Open the PDF
-# 你的数据
-data = [[(294.7569528415961, 776.8430953398889, 300.8827085852479, 786.922616502779), (460.1523579201934, 776.8430953398889, 509.51874244256345, 787.2825994014537)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.882855921334, 156.74727932285367, 789.8024796921762)], [(293.6759371221282, 779.7229585292861, 301.60338573155985, 788.7225309961523), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.8024796921762)], [(295.8379685610641, 780.0829414279607, 301.24304715840384, 788.0025651988029), (85.76058041112454, 781.5228730226593, 156.74727932285367, 790.1624625908509)], [(294.03627569528413, 779.7229585292861, 301.60338573155985, 789.0825138948269), (459.79201934703747, 779.7229585292861, 508.4377267230955, 789.4424967935015)], [(86.4812575574365, 781.882855921334, 156.0266021765417, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 301.24304715840384, 788.3625480974777), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.4424967935015)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.5228730226593, 156.74727932285367, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 300.8827085852479, 788.3625480974777)]]
-
-# 对每个页面进行处理
-for i, page in enumerate(doc):
-    # 获取当前页面的数据
-    page_data = data[i]
-    for img in page_data:
-        # x0, y0, x1, y1, _ = img
-        x0, y0, x1, y1 = img
+def draw_bbox(i, bbox_list, page, rgb_config):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for bbox in page_data:
+        x0, y0, x1, y1 = bbox
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
-        page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
+        page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True)  # Draw the rectangle
 
-# Save the PDF
-doc.save(r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json_new.pdf")
+
+def draw_layout_bbox(pdf_info_dict, input_path, out_path):
+    layout_bbox_list = []
+    for page in pdf_info_dict.values():
+        page_list = []
+        for layout in page['layout_bboxes']:
+            page_list.append(layout['layout_bbox'])
+        layout_bbox_list.append(page_list)
+
+    doc = fitz.open(input_path)
+    for i, page in enumerate(doc):
+        # 获取当前页面的数据
+        page_data = layout_bbox_list[i]
+        for j, bbox in enumerate(page_data):
+            x0, y0, x1, y1 = bbox
+            rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+            page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=0.5, overlay=True)  # Draw the rectangle
+            page.insert_text((x0, y0), str(j + 1), fontsize=10, color=(1, 0, 0))  # Insert the index at the top left corner of the rectangle
+    # Save the PDF
+    doc.save(f"{out_path}/layout.pdf")
+
+def draw_text_bbox(pdf_info_dict, input_path, out_path):
+    text_list = []
+    inline_equation_list = []
+    displayed_equation_list = []
+    for page in pdf_info_dict.values():
+        page_text_list = []
+        page_inline_equation_list = []
+        page_displayed_equation_list = []
+        for block in page['preproc_blocks']:
+            for line in block['lines']:
+                for span in line['spans']:
+                    if span['type'] == 'text':
+                        page_text_list.append(span['bbox'])
+                    elif span['type'] == 'inline_equation':
+                        page_inline_equation_list.append(span['bbox'])
+                    elif span['type'] == 'displayed_equation':
+                        page_displayed_equation_list.append(span['bbox'])
+        text_list.append(page_text_list)
+        inline_equation_list.append(page_inline_equation_list)
+        displayed_equation_list.append(page_displayed_equation_list)
+
+    doc = fitz.open(input_path)
+    for i, page in enumerate(doc):
+        # 获取当前页面的数据
+        draw_bbox(i, text_list, page, [255, 0, 0])
+
+        draw_bbox(i, inline_equation_list, page, [0, 255, 0])
+
+        draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
+
+    # Save the PDF
+    doc.save(f"{out_path}/text.pdf")

+ 12 - 5
demo/ocr_demo.py

@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from pathlib import Path
 
-from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
+from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
 from magic_pdf.libs.commons import join_path
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
@@ -30,15 +30,20 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
+    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
     try:
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
         book_name = pth.name
         save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
         save_path = join_path(save_tmp_path, "md")
-        text_content_save_path = f"{save_path}/{book_name}/book.md"
+        save_path_with_bookname = os.path.join(save_path, book_name)
+        text_content_save_path = f"{save_path_with_bookname}/book.md"
         pdf_info_dict = parse_pdf_by_ocr(
             ocr_pdf_path,
             None,
@@ -46,11 +51,13 @@ if __name__ == '__main__':
             save_path,
             book_name,
             debug_mode=True)
+
         parent_dir = os.path.dirname(text_content_save_path)
         if not os.path.exists(parent_dir):
             os.makedirs(parent_dir)
 
-        markdown_content = mk_nlp_markdown(pdf_info_dict)
+        # markdown_content = mk_nlp_markdown(pdf_info_dict)
+        markdown_content = mk_mm_markdown(pdf_info_dict)
 
         with open(text_content_save_path, "w", encoding="utf-8") as f:
             f.write(markdown_content)

+ 9 - 11
magic_pdf/pdf_parse_by_ocr.py

@@ -4,6 +4,7 @@ import time
 
 from loguru import logger
 
+from demo.draw_bbox import draw_layout_bbox, draw_text_bbox
 from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.safe_filename import sanitize_filename
@@ -182,17 +183,14 @@ def parse_pdf_by_ocr(
         page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
-        # 在测试时,保存调试信息
-        if debug_mode:
-            params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
-            page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
-
-            with open(params_file_save_path, "w", encoding="utf-8") as f:
-                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-            # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
-            if os.path.exists(page_draw_rect_save_path):
-                os.remove(page_draw_rect_save_path)
-            # 绘制bbox和layout到pdf
+    # 在测试时,保存调试信息
+    if debug_mode:
+        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+        with open(params_file_save_path, "w", encoding="utf-8") as f:
+            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+        # draw_bbox
+        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
+        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
 
 
     return pdf_info_dict