Browse Source

Merge branch 'master' into dev-in-line-bbox

# Conflicts:
#	demo/draw_bbox.py
liukaiwen 1 year ago
parent
commit
1b4af400cc

+ 64 - 57
demo/draw_bbox.py

@@ -1,59 +1,66 @@
-from pathlib import Path
-
-from magic_pdf.libs.commons import fitz, join_path  # PyMuPDF
-from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
-import json
-import os
-
-
-
-
-def read_json_file(file_path):
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-    return data
-
-
-# PDF文件路径
-pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_2_org.pdf"
-
-doc = fitz.open(pdf_path)  # Open the PDF
-# 你的数据
-data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
-ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_2.json"
-ocr_pdf_info = read_json_file(ocr_json_file_path)
-
-pth = Path(ocr_json_file_path)
-book_name = pth.name
-save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
-save_path = join_path(save_tmp_path, "md")
-
-pdf_info_dict = parse_pdf_by_ocr(
-            pdf_path,
-            None,
-            ocr_pdf_info,
-            save_path,
-            book_name,
-            debug_mode=True)
-data_list = []
-for page in pdf_info_dict.values():
-    page_list = []
-    blocks = page.get("preproc_blocks")
-    for block in blocks:
-        lines = block.get("lines")
-        for line in lines:
-            spans = line.get("spans")
-            for span in spans:
-                page_list.append(span["bbox"])
-    data_list.append(page_list)
-# 对每个页面进行处理
-for i, page in enumerate(doc):
-    # 获取当前页面的数据
-    page_data = data_list[i]
-    for img in page_data:
-        x0, y0, x1, y1 = img
+from magic_pdf.libs.commons import fitz  # PyMuPDF
+
+def draw_bbox(i, bbox_list, page, rgb_config):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for bbox in page_data:
+        x0, y0, x1, y1 = bbox
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
-        page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
+        page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True)  # Draw the rectangle
+
+
+def draw_layout_bbox(pdf_info_dict, input_path, out_path):
+    layout_bbox_list = []
+    for page in pdf_info_dict.values():
+        page_list = []
+        for layout in page['layout_bboxes']:
+            page_list.append(layout['layout_bbox'])
+        layout_bbox_list.append(page_list)
+
+    doc = fitz.open(input_path)
+    for i, page in enumerate(doc):
+        # 获取当前页面的数据
+        page_data = layout_bbox_list[i]
+        for j, bbox in enumerate(page_data):
+            x0, y0, x1, y1 = bbox
+            rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+            page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=0.5, overlay=True)  # Draw the rectangle
+            page.insert_text((x0, y0), str(j + 1), fontsize=10, color=(1, 0, 0))  # Insert the index at the top left corner of the rectangle
+    # Save the PDF
+    doc.save(f"{out_path}/layout.pdf")
+
+def draw_text_bbox(pdf_info_dict, input_path, out_path):
+    text_list = []
+    inline_equation_list = []
+    displayed_equation_list = []
+    for page in pdf_info_dict.values():
+        page_text_list = []
+        page_inline_equation_list = []
+        page_displayed_equation_list = []
+        for block in page['preproc_blocks']:
+            for line in block['lines']:
+                for span in line['spans']:
+                    if span['type'] == 'text':
+                        page_text_list.append(span['bbox'])
+                    elif span['type'] == 'inline_equation':
+                        page_inline_equation_list.append(span['bbox'])
+                    elif span['type'] == 'displayed_equation':
+                        page_displayed_equation_list.append(span['bbox'])
+        text_list.append(page_text_list)
+        inline_equation_list.append(page_inline_equation_list)
+        displayed_equation_list.append(page_displayed_equation_list)
+
+    doc = fitz.open(input_path)
+    for i, page in enumerate(doc):
+        # 获取当前页面的数据
+        draw_bbox(i, text_list, page, [255, 0, 0])
+
+        draw_bbox(i, inline_equation_list, page, [0, 255, 0])
+
+        draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
 
-# Save the PDF
-doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_2_new1.pdf")
+    # Save the PDF
+    doc.save(f"{out_path}/text.pdf")

+ 12 - 5
demo/ocr_demo.py

@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from pathlib import Path
 
-from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
+from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
 from magic_pdf.libs.commons import join_path
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
@@ -30,15 +30,20 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
+    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
     try:
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
         book_name = pth.name
         save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
         save_path = join_path(save_tmp_path, "md")
-        text_content_save_path = f"{save_path}/{book_name}/book.md"
+        save_path_with_bookname = os.path.join(save_path, book_name)
+        text_content_save_path = f"{save_path_with_bookname}/book.md"
         pdf_info_dict = parse_pdf_by_ocr(
             ocr_pdf_path,
             None,
@@ -46,11 +51,13 @@ if __name__ == '__main__':
             save_path,
             book_name,
             debug_mode=True)
+
         parent_dir = os.path.dirname(text_content_save_path)
         if not os.path.exists(parent_dir):
             os.makedirs(parent_dir)
 
-        markdown_content = mk_nlp_markdown(pdf_info_dict)
+        # markdown_content = mk_nlp_markdown(pdf_info_dict)
+        markdown_content = mk_mm_markdown(pdf_info_dict)
 
         with open(text_content_save_path, "w", encoding="utf-8") as f:
             f.write(markdown_content)

+ 28 - 0
magic_pdf/dict2md/ocr_mkcontent.py

@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                 # 在行末添加两个空格以强制换行
                 markdown.append(line_text.strip() + '  ')
     return '\n'.join(markdown)
+
+def mk_mm_markdown(pdf_info_dict: dict):
+
+    markdown = []
+
+    for _, page_info in pdf_info_dict.items():
+        blocks = page_info.get("preproc_blocks")
+        if not blocks:
+            continue
+        for block in blocks:
+            for line in block['lines']:
+                line_text = ''
+                for span in line['spans']:
+                    if not span.get('content'):
+                        if not span.get('image_path'):
+                            continue
+                        else:
+                            content = f"![]({span['image_path']})"
+                    else:
+                        content = span['content'].replace('$', '\$')  # 转义$
+                        if span['type'] == 'inline_equation':
+                            content = f"${content}$"
+                        elif span['type'] == 'displayed_equation':
+                            content = f"$$\n{content}\n$$"
+                    line_text += content + ' '
+                # 在行末添加两个空格以强制换行
+                markdown.append(line_text.strip() + '  ')
+    return '\n'.join(markdown)

+ 9 - 11
magic_pdf/pdf_parse_by_ocr.py

@@ -4,6 +4,7 @@ import time
 
 from loguru import logger
 
+from demo.draw_bbox import draw_layout_bbox, draw_text_bbox
 from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.safe_filename import sanitize_filename
@@ -185,17 +186,14 @@ def parse_pdf_by_ocr(
         page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
-        # 在测试时,保存调试信息
-        if debug_mode:
-            params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
-            page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
-
-            with open(params_file_save_path, "w", encoding="utf-8") as f:
-                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-            # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
-            if os.path.exists(page_draw_rect_save_path):
-                os.remove(page_draw_rect_save_path)
-            # 绘制bbox和layout到pdf
+    # 在测试时,保存调试信息
+    if debug_mode:
+        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+        with open(params_file_save_path, "w", encoding="utf-8") as f:
+            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+        # draw_bbox
+        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
+        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
 
 
     return pdf_info_dict

+ 2 - 2
magic_pdf/pre_proc/ocr_cut_image.py

@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
     for span in spans:
         span_type = span['type']
         if span_type == 'image':
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('image'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
         elif span_type == 'table':
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('table'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
 
     return spans

+ 34 - 29
magic_pdf/pre_proc/ocr_detect_layout.py

@@ -1,5 +1,6 @@
 import fitz
 
+from magic_pdf.layout.layout_sort import get_bboxes_layout
 from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
@@ -26,23 +27,16 @@ def get_area(bbox):
     return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
 
 
-def adjust_layouts(layout_bboxes):
+def adjust_layouts(layout_bboxes, page_boundry, page_id):
     # 遍历所有布局框
     for i in range(len(layout_bboxes)):
         # 遍历当前布局框之后的布局框
         for j in range(i + 1, len(layout_bboxes)):
             # 判断两个布局框是否重叠
-            if _is_part_overlap(layout_bboxes[i]["layout_bbox"], layout_bboxes[j]["layout_bbox"]):
+            if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]):
                 # 计算每个布局框的中心点坐标和面积
-                center_i = get_center_point(layout_bboxes[i]["layout_bbox"])
-                area_i = get_area(layout_bboxes[i]["layout_bbox"])
-
-                center_j = get_center_point(layout_bboxes[j]["layout_bbox"])
-                area_j = get_area(layout_bboxes[j]["layout_bbox"])
-
-                # 计算横向和纵向的距离差
-                dx = abs(center_i[0] - center_j[0])
-                dy = abs(center_i[1] - center_j[1])
+                area_i = get_area(layout_bboxes[i])
+                area_j = get_area(layout_bboxes[j])
 
                 # 较大布局框和较小布局框的赋值
                 if area_i > area_j:
@@ -50,19 +44,29 @@ def adjust_layouts(layout_bboxes):
                 else:
                     larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
 
+                center_large = get_center_point(larger_layout)
+                center_small = get_center_point(smaller_layout)
+                # 计算横向和纵向的距离差
+                distance_x = center_large[0] - center_small[0]
+                distance_y = center_large[1] - center_small[1]
+
                 # 根据距离差判断重叠方向并修正边界
-                if dx > dy:  # 左右重叠
-                    if larger_layout["layout_bbox"][0] < smaller_layout["layout_bbox"][2]:
-                        larger_layout["layout_bbox"][0] = smaller_layout["layout_bbox"][2]
-                    else:
-                        larger_layout["layout_bbox"][2] = smaller_layout["layout_bbox"][0]
+                if abs(distance_x) > abs(distance_y):  # 左右重叠
+                    if distance_x > 0 and larger_layout[0] < smaller_layout[2]:
+                        larger_layout[0] = smaller_layout[2]+1
+                    if distance_x < 0 and larger_layout[2] > smaller_layout[0]:
+                        larger_layout[2] = smaller_layout[0]-1
                 else:  # 上下重叠
-                    if larger_layout["layout_bbox"][1] < smaller_layout["layout_bbox"][3]:
-                        larger_layout["layout_bbox"][1] = smaller_layout["layout_bbox"][3]
-                    else:
-                        larger_layout["layout_bbox"][3] = smaller_layout["layout_bbox"][1]
-    # todo 排序调整布局边界框列表
+                    if distance_y > 0 and larger_layout[1] < smaller_layout[3]:
+                        larger_layout[1] = smaller_layout[3]+1
+                    if distance_y < 0 and larger_layout[3] > smaller_layout[1]:
+                        larger_layout[3] = smaller_layout[1]-1
+    # 排序调整布局边界框列表
+    new_bboxes = []
+    for layout_bbox in layout_bboxes:
+        new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None,None])
 
+    layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
 
     # 返回排序调整后的布局边界框列表
     return layout_bboxes
@@ -79,6 +83,7 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
         list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
 
     """
+    page_id = ocr_page_info['page_info']['page_no']-1
     horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
     # 初始化布局边界框列表
     layout_bboxes = []
@@ -88,12 +93,9 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
         x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
         bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
                 int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
-        # 创建子布局的边界框字典
-        layout_bbox = {
-            "layout_bbox": bbox,
-        }
+
         # 将子布局的边界框添加到列表中
-        layout_bboxes.append(layout_bbox)
+        layout_bboxes.append(bbox)
 
     # 初始化新的布局边界框列表
     new_layout_bboxes = []
@@ -102,14 +104,14 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
         # 初始化标记变量,用于判断当前边界框是否需要保留
         keep = True
         # 获取当前边界框的坐标信息
-        box_i = layout_bboxes[i]["layout_bbox"]
+        box_i = layout_bboxes[i]
 
         # 遍历其他边界框
         for j in range(len(layout_bboxes)):
             # 排除当前边界框自身
             if i != j:
                 # 获取其他边界框的坐标信息
-                box_j = layout_bboxes[j]["layout_bbox"]
+                box_j = layout_bboxes[j]
                 # 检测box_i是否被box_j包含
                 if _is_in(box_i, box_j):
                     # 如果当前边界框被其他边界框包含,则标记为不需要保留
@@ -122,7 +124,10 @@ def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
             new_layout_bboxes.append(layout_bboxes[i])
 
     # 对新的布局边界框列表进行排序调整
-    layout_bboxes = adjust_layouts(new_layout_bboxes)
+    page_width = page.rect.width
+    page_height = page.rect.height
+    page_boundry = [0, 0, page_width, page_height]
+    layout_bboxes = adjust_layouts(new_layout_bboxes, page_boundry, page_id)
 
     # 返回排序调整后的布局边界框列表
     return layout_bboxes