Sfoglia il codice sorgente

Merge branch 'master' into dev-in-line-bbox

# Conflicts:
#	demo/draw_bbox.py
#	demo/ocr_demo.py
#	magic_pdf/pdf_parse_by_ocr.py
#	magic_pdf/pre_proc/ocr_dict_merge.py
liukaiwen 1 anno fa
parent
commit
da5091430b

+ 31 - 7
demo/ocr_demo.py

@@ -2,8 +2,10 @@ import json
 import os
 
 from loguru import logger
+from pathlib import Path
 
 from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
+from magic_pdf.libs.commons import join_path
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
 
@@ -28,10 +30,32 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
-    ocr_pdf_info = read_json_file(ocr_json_file_path)
-    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
-    markdown_text = mk_nlp_markdown(pdf_info_dict)
-    logger.info(markdown_text)
-    save_markdown(markdown_text, ocr_json_file_path)
-
+    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
+    try:
+        ocr_pdf_model_info = read_json_file(ocr_json_file_path)
+        pth = Path(ocr_json_file_path)
+        book_name = pth.name
+        save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+        save_path = join_path(save_tmp_path, "md")
+        text_content_save_path = f"{save_path}/{book_name}/book.md"
+        pdf_info_dict = parse_pdf_by_ocr(
+            ocr_pdf_path,
+            None,
+            ocr_pdf_model_info,
+            save_path,
+            book_name,
+            debug_mode=True)
+        parent_dir = os.path.dirname(text_content_save_path)
+        if not os.path.exists(parent_dir):
+            os.makedirs(parent_dir)
+
+        markdown_content = mk_nlp_markdown(pdf_info_dict)
+
+        with open(text_content_save_path, "w", encoding="utf-8") as f:
+            f.write(markdown_content)
+
+        # logger.info(markdown_content)
+        # save_markdown(markdown_text, ocr_json_file_path)
+    except Exception as e:
+        logger.exception(e)

+ 21 - 0
magic_pdf/libs/boxbase.py

@@ -177,6 +177,27 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
     else:
         return intersection_area / min_box_area
 
+def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
+    """
+    Return the fraction of bbox1's area that is covered by bbox2.
+
+    Both boxes are indexed as [x0, y0, x1, y1]. The result is always a float:
+    0.0 when the boxes do not intersect or when bbox1 has zero area,
+    otherwise intersection_area / bbox1_area.
+    """
+    # Corners of the intersection rectangle.
+    x_left = max(bbox1[0], bbox2[0])
+    y_top = max(bbox1[1], bbox2[1])
+    x_right = min(bbox1[2], bbox2[2])
+    y_bottom = min(bbox1[3], bbox2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    if bbox1_area == 0:
+        # Degenerate bbox1: avoid dividing by zero and return a float like
+        # every other path (this branch used to return the int 0).
+        return 0.0
+
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+    return intersection_area / bbox1_area
+
 
 def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
     """

+ 29 - 0
magic_pdf/libs/commons.py

@@ -1,4 +1,5 @@
 import datetime
+import json
 import os, re, configparser
 import time
 
@@ -115,6 +116,34 @@ def read_file(pdf_path: str, s3_profile):
         with open(pdf_path, "rb") as f:
             return f.read()
 
+
+def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
+    """
+    Return the model output for a single page.
+
+    Args:
+        pdf_model_output: either a list indexed by page_id, or a str naming
+            the directory/prefix that holds the model's JSON output.
+        pdf_model_s3_profile: profile forwarded to read_file.
+        page_id: zero-based page index.
+
+    For a str input the lookup order is:
+      1. "page_{page_id + 1}.json" if that path exists locally (the model
+         numbers its per-page files starting from 1);
+      2. "model.json" opened locally, entry ["doc_layout_result"][page_id];
+      3. "{page_id}.json" loaded through read_file.
+
+    Raises:
+        TypeError: if pdf_model_output is neither a str nor a list.
+    """
+    if isinstance(pdf_model_output, list):
+        return pdf_model_output[page_id]
+
+    if not isinstance(pdf_model_output, str):
+        # Previously this fell through to an UnboundLocalError on return.
+        raise TypeError(
+            f"pdf_model_output must be a str or a list, got {type(pdf_model_output).__name__}"
+        )
+
+    page_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
+    if os.path.exists(page_json_path):
+        return json.loads(read_file(page_json_path, pdf_model_s3_profile))
+
+    try:
+        with open(join_path(pdf_model_output, "model.json"), "r", encoding="utf-8") as f:
+            return json.load(f)["doc_layout_result"][page_id]
+    except Exception:
+        # Best-effort fallback: any failure reading model.json (missing file,
+        # invalid JSON, missing key, page out of range) lands here. Unlike a
+        # bare "except:", KeyboardInterrupt and SystemExit still propagate.
+        fallback_json_path = join_path(pdf_model_output, f"{page_id}.json")
+        return json.loads(read_file(fallback_json_path, pdf_model_s3_profile))
+
+
 def list_dir(dir_path:str, s3_profile:str):
     """
     列出dir_path下的所有文件

+ 9 - 0
magic_pdf/libs/coordinate_transform.py

@@ -0,0 +1,9 @@
+def get_scale_ratio(ocr_page_info, page):
+    """
+    Return (horizontal, vertical) scale ratios for one page.
+
+    Each ratio is the size recorded under ocr_page_info['page_info']
+    ('width' / 'height') divided by the integer size of the pixmap that
+    page.get_pixmap(dpi=72) produces.
+    """
+    pixmap = page.get_pixmap(dpi=72)
+    rendered_width, rendered_height = int(pixmap.w), int(pixmap.h)
+
+    reported_width = ocr_page_info['page_info']['width']
+    reported_height = ocr_page_info['page_info']['height']
+
+    return reported_width / rendered_width, reported_height / rendered_height

+ 1 - 26
magic_pdf/pdf_parse_by_model.py

@@ -2,7 +2,7 @@ import time
 
 # from anyio import Path
 
-from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client
+from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client, get_docx_model_output
 import json
 import os
 import math
@@ -68,31 +68,6 @@ paraSplitException_msg = ParaSplitException().message
 paraMergeException_msg = ParaMergeException().message
 
 
-def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
-    if isinstance(pdf_model_output, str):
-        model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")  # 模型输出的页面编号从1开始的
-        if os.path.exists(model_output_json_path):
-            json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
-            model_output_json = json.loads(json_from_docx)
-        else:
-            try:
-                model_output_json_path = join_path(pdf_model_output, "model.json")
-                with open(model_output_json_path, "r", encoding="utf-8") as f:
-                    model_output_json = json.load(f)
-                    model_output_json = model_output_json["doc_layout_result"][page_id]
-            except:
-                s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
-                s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
-                #s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
-                # logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
-
-                s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
-                return json.loads(s)
-
-    elif isinstance(pdf_model_output, list):
-        model_output_json = pdf_model_output[page_id]
-
-    return model_output_json
 
 
 def parse_pdf_by_model(

+ 123 - 14
magic_pdf/pdf_parse_by_ocr.py

@@ -1,34 +1,114 @@
+import json
+import os
+import time
+
 from loguru import logger
 
 from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
+from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+from magic_pdf.libs.safe_filename import sanitize_filename
+from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
+from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
+from magic_pdf.pre_proc.detect_header import parse_headers
+from magic_pdf.pre_proc.detect_page_number import parse_pageNos
+from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
+from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
+from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
+from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
 
 
-def construct_page_component(page_id, blocks):
+def construct_page_component(page_id, blocks, layout_bboxes):
     return_dict = {
         'preproc_blocks': blocks,
         'page_idx': page_id,
+        'layout_bboxes': layout_bboxes,
     }
     return return_dict
 
 
 def parse_pdf_by_ocr(
-    ocr_pdf_info,
-    start_page_id=0,
-    end_page_id=None,
+        pdf_path,
+        s3_pdf_profile,
+        pdf_model_output,
+        save_path,
+        book_name,
+        pdf_model_profile=None,
+        image_s3_config=None,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
 ):
+    pdf_bytes = read_file(pdf_path, s3_pdf_profile)
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+    book_name = sanitize_filename(book_name)
+    md_bookname_save_path = ""
+    if debug_mode:
+        save_path = join_path(save_tmp_path, "md")
+        pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
+
+        if not os.path.exists(os.path.dirname(pdf_local_path)):
+            # 如果目录不存在,创建它
+            os.makedirs(os.path.dirname(pdf_local_path))
+
+        md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
+        if not os.path.exists(md_bookname_save_path):
+            # 如果目录不存在,创建它
+            os.makedirs(md_bookname_save_path)
+
+        with open(pdf_local_path + ".pdf", "wb") as pdf_file:
+            pdf_file.write(pdf_bytes)
 
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    # 初始化空的pdf_info_dict
     pdf_info_dict = {}
-    end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
+    img_s3_client = get_img_s3_client(save_path, image_s3_config)
+
+    start_time = time.time()
+
+    remove_bboxes = []
+
+    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
     for page_id in range(start_page_id, end_page_id + 1):
-        ocr_page_info = ocr_pdf_info[page_id]
+
+        # 获取当前页的page对象
+        page = pdf_docs[page_id]
+
+        if debug_mode:
+            time_now = time.time()
+            logger.info(f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}")
+            start_time = time_now
+
+        # 获取当前页的模型数据
+        ocr_page_info = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
+
+        """从json中获取每页的页码、页眉、页脚的bbox"""
+        page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
+        header_bboxes = parse_headers(page_id, page, ocr_page_info)
+        footer_bboxes = parse_footers(page_id, page, ocr_page_info)
+        footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path,
+                                                   debug_mode=debug_mode)
+
+        # 构建需要remove的bbox列表
+        need_remove_spans_bboxes = []
+        need_remove_spans_bboxes.extend(page_no_bboxes)
+        need_remove_spans_bboxes.extend(header_bboxes)
+        need_remove_spans_bboxes.extend(footer_bboxes)
+        need_remove_spans_bboxes.extend(footnote_bboxes)
+
         layout_dets = ocr_page_info['layout_dets']
         spans = []
+
+        # 计算模型坐标和pymu坐标的缩放比例
+        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
+
         for layout_det in layout_dets:
             category_id = layout_det['category_id']
             allow_category_id_list = [1, 7, 13, 14, 15]
             if category_id in allow_category_id_list:
                 x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
-                bbox = [int(x0), int(y0), int(x1), int(y1)]
+                bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
+                        int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
                 '''要删除的'''
                 #  3: 'header',      # 页眉
                 #  4: 'page number', # 页码
@@ -48,8 +128,10 @@ def parse_pdf_by_ocr(
                 }
                 if category_id == 1:
                     span['type'] = 'image'
+
                 elif category_id == 7:
                     span['type'] = 'table'
+
                 elif category_id == 13:
                     span['content'] = layout_det['latex']
                     span['type'] = 'inline_equation'
@@ -68,13 +150,28 @@ def parse_pdf_by_ocr(
         spans = remove_overlaps_min_spans(spans)
 
         # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
-        #spans = modify_y_axis(spans)
+        # spans = modify_y_axis(spans)
+
+        # 删除remove_span_block_bboxes中的bbox
+        spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
+
+        # 对image和table截图
+        spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
+
+
+        # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
 
-        # 将spans合并成line(从上到下,从左到右)
-        lines = merge_spans_to_line(spans)
-        # logger.info(lines)
+        # 模型识别错误的行间公式, type类型转换成行内公式
 
-        # 从ocr_page_info中获取layout信息
+        # bbox去除粘连
+
+        # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
+
+        # 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
+        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
+
+        # 将spans合并成line(在layout内,从上到下,从左到右)
+        lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
 
 
         # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
@@ -86,8 +183,20 @@ def parse_pdf_by_ocr(
             })
 
         # 构造pdf_info_dict
-        page_info = construct_page_component(page_id, blocks)
+        page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
-    return pdf_info_dict
+        # 在测试时,保存调试信息
+        if debug_mode:
+            params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+            page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
 
+            with open(params_file_save_path, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+            # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
+            if os.path.exists(page_draw_rect_save_path):
+                os.remove(page_draw_rect_save_path)
+            # 绘制bbox和layout到pdf
+
+
+    return pdf_info_dict

+ 6 - 16
magic_pdf/pre_proc/detect_footer_by_model.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,23 +9,12 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 footer ---------#
     footer_bbox_from_DocXChain = []
 
-    
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -42,10 +32,10 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_footnote.py

@@ -1,5 +1,6 @@
 from collections import Counter
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
@@ -9,22 +10,12 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 footnote ---------#
     footnote_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -42,10 +33,10 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_header.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,22 +9,12 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 header ---------#
     header_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -41,10 +32,10 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_page_number.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,22 +9,12 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 pageNo ---------#
     pageNo_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -41,10 +32,10 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 19 - 0
magic_pdf/pre_proc/ocr_cut_image.py

@@ -0,0 +1,19 @@
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.pdf_image_tools import cut_image
+
+
+def cut_image_and_table(spans, page, page_id, book_name, save_path):
+    """
+    Crop every 'image' and 'table' span and record the result on the span.
+
+    For each such span, cut_image is called with the span's bbox, page_id,
+    page and the directory join_path(save_path, join_path(book_name, <type>));
+    its return value is stored under span['image_path']. Spans of any other
+    type are left untouched. The same list is returned, mutated in place.
+    """
+    croppable_types = ('image', 'table')
+
+    for span in spans:
+        kind = span['type']
+        if kind not in croppable_types:
+            continue
+        target_dir = join_path(save_path, join_path(book_name, kind))
+        span['image_path'] = cut_image(span['bbox'], page_id, page, target_dir)
+
+    return spans

+ 128 - 0
magic_pdf/pre_proc/ocr_detect_layout.py

@@ -0,0 +1,128 @@
+import fitz
+
+from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def get_center_point(bbox):
+    """
+    Compute the center point of a bounding box.
+    Args:
+        bbox (list): four values indexed as [x0, y0, x1, y1]
+            (top-left x, top-left y, bottom-right x, bottom-right y).
+    Returns:
+        list: [center_x, center_y].
+    """
+    center_x = (bbox[0] + bbox[2]) / 2
+    center_y = (bbox[1] + bbox[3]) / 2
+    return [center_x, center_y]
+
+
+def get_area(bbox):
+    """
+    Compute the area of a bounding box.
+    Args:
+        bbox (list): four values indexed as [x0, y0, x1, y1]
+            (top-left x, top-left y, bottom-right x, bottom-right y).
+    Returns:
+        The product of the box's width (x1 - x0) and height (y1 - y0).
+    """
+    width = bbox[2] - bbox[0]
+    height = bbox[3] - bbox[1]
+    return width * height
+
+
+def adjust_layouts(layout_bboxes):
+    """
+    Trim partially overlapping layout boxes so the larger one gives way.
+
+    Every unordered pair of entries is checked with _is_part_overlap. For an
+    overlapping pair, the box with the larger area has one edge moved onto an
+    edge of the smaller box; which axis is trimmed depends on whether the two
+    centers are further apart horizontally (dx) or vertically (dy).
+
+    Args:
+        layout_bboxes (list): dicts that each hold a mutable
+            "layout_bbox" list indexed as [x0, y0, x1, y1].
+
+    Returns:
+        list: the same list object; the "layout_bbox" lists are edited in
+        place. No sorting is performed yet (see the todo below).
+    """
+    # Walk over every layout box.
+    for i in range(len(layout_bboxes)):
+        # Compare it with every layout box that comes after it.
+        for j in range(i + 1, len(layout_bboxes)):
+            # Only pairs that partially overlap are adjusted.
+            if _is_part_overlap(layout_bboxes[i]["layout_bbox"], layout_bboxes[j]["layout_bbox"]):
+                # Center point and area of each box.
+                center_i = get_center_point(layout_bboxes[i]["layout_bbox"])
+                area_i = get_area(layout_bboxes[i]["layout_bbox"])
+
+                center_j = get_center_point(layout_bboxes[j]["layout_bbox"])
+                area_j = get_area(layout_bboxes[j]["layout_bbox"])
+
+                # Horizontal and vertical distance between the two centers.
+                dx = abs(center_i[0] - center_j[0])
+                dy = abs(center_i[1] - center_j[1])
+
+                # Decide which box is the larger one; on equal areas, j is treated as larger.
+                if area_i > area_j:
+                    larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j]
+                else:
+                    larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
+
+                # Pick the overlap direction from the distances and trim the larger box.
+                # NOTE(review): if _is_part_overlap only reports pairs whose x- and y-ranges
+                # intersect, the first test in each branch below is always true, so the else
+                # arms never run and a smaller box lying to the right of / below the larger
+                # one would push the larger box's x0 / y0 past its own x1 / y1 - confirm.
+                if dx > dy:  # side-by-side overlap
+                    if larger_layout["layout_bbox"][0] < smaller_layout["layout_bbox"][2]:
+                        larger_layout["layout_bbox"][0] = smaller_layout["layout_bbox"][2]
+                    else:
+                        larger_layout["layout_bbox"][2] = smaller_layout["layout_bbox"][0]
+                else:  # stacked (top/bottom) overlap
+                    if larger_layout["layout_bbox"][1] < smaller_layout["layout_bbox"][3]:
+                        larger_layout["layout_bbox"][1] = smaller_layout["layout_bbox"][3]
+                    else:
+                        larger_layout["layout_bbox"][3] = smaller_layout["layout_bbox"][1]
+    # todo: sort the adjusted list of layout boxes
+
+
+    # Return the adjusted layout boxes (not sorted yet, see the todo above).
+    return layout_bboxes
+
+
+def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
+    """
+    Build the list of layout boxes for one page.
+
+    Each entry of layout_info supplies an 8-value 'poly'; values 0, 1, 4 and 5
+    are taken as x0, y0, x1, y1, divided by the ratios from get_scale_ratio and
+    truncated with int(). A box is dropped when _is_in(box, other) holds for
+    any other box; the survivors are passed through adjust_layouts.
+
+    Args:
+        layout_info (list): dicts that each carry a 'poly' entry.
+        page (fitz.Page): forwarded to get_scale_ratio.
+        ocr_page_info (dict): forwarded to get_scale_ratio.
+
+    Returns:
+        list: dicts of the form {"layout_bbox": [x0, y0, x1, y1]}, as returned
+        by adjust_layouts.
+    """
+    h_ratio, v_ratio = get_scale_ratio(ocr_page_info, page)
+
+    # Convert every sub-layout polygon into a scaled integer bbox.
+    candidates = []
+    for sub_layout in layout_info:
+        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
+        candidates.append({
+            "layout_bbox": [int(x0 / h_ratio), int(y0 / v_ratio),
+                            int(x1 / h_ratio), int(y1 / v_ratio)],
+        })
+
+    def _inside_another(index):
+        # True as soon as one *other* candidate reports containing this box.
+        box = candidates[index]["layout_bbox"]
+        return any(
+            _is_in(box, other["layout_bbox"])
+            for other_index, other in enumerate(candidates)
+            if other_index != index
+        )
+
+    # Keep only the boxes that no other box contains.
+    outermost = [
+        candidate
+        for index, candidate in enumerate(candidates)
+        if not _inside_another(index)
+    ]
+
+    # Resolve partial overlaps between the remaining boxes.
+    return adjust_layouts(outermost)

+ 51 - 18
magic_pdf/libs/ocr_dict_merge.py → magic_pdf/pre_proc/ocr_dict_merge.py

@@ -1,4 +1,7 @@
-from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
+from loguru import logger
+
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
+    calculate_overlap_area_in_bbox1_area_ratio
 
 
 # 删除重叠spans中较小的那些
@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
     return spans
 
 
+# Sort the spans of every line from left to right
+def line_sort_spans_by_left_to_right(lines):
+    """
+    Sort each line's spans by x0 and wrap the line with its bounding box.
+
+    Args:
+        lines (list): lists of span dicts; each span has a 'bbox' indexed as
+            [x0, y0, x1, y1]. Every inner list is sorted in place.
+
+    Returns:
+        list: one {"bbox": [...], "spans": line} dict per input line, where
+        "bbox" is the union of the line's span boxes and "spans" is the same
+        (now sorted) list object that was passed in.
+    """
+    wrapped_lines = []
+    for row in lines:
+        # Order by the left edge (x0).
+        row.sort(key=lambda item: item['bbox'][0])
+        boxes = [item['bbox'] for item in row]
+        union_bbox = [
+            min(box[0] for box in boxes),  # x0
+            min(box[1] for box in boxes),  # y0
+            max(box[2] for box in boxes),  # x1
+            max(box[3] for box in boxes),  # y1
+        ]
+        wrapped_lines.append({"bbox": union_bbox, "spans": row})
+    return wrapped_lines
+
 def merge_spans_to_line(spans):
     # 按照y0坐标排序
     spans.sort(key=lambda span: span['bbox'][1])
@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
     for span in spans[1:]:
         # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
         # image和table类型,同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+        if span['type'] in ["displayed_equation", "image", "table"] or any(
+                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
             # 则开始新行
             lines.append(current_line)
             current_line = [span]
@@ -41,23 +63,34 @@ def merge_spans_to_line(spans):
     if current_line:
         lines.append(current_line)
 
-    # 计算每行的边界框,并对每行中的span按照x0进行排序
-    line_objects = []
-    for line in lines:
-        # 按照x0坐标排序
-        line.sort(key=lambda span: span['bbox'][0])
-        line_bbox = [
-            min(span['bbox'][0] for span in line),  # x0
-            min(span['bbox'][1] for span in line),  # y0
-            max(span['bbox'][2] for span in line),  # x1
-            max(span['bbox'][3] for span in line),  # y1
-        ]
-        line_objects.append({
-            "bbox": line_bbox,
-            "spans": line,
-        })
+    return lines
 
-    return line_objects
+def merge_spans_to_line_by_layout(spans, layout_bboxes):
+    """
+    Group spans by layout box, then merge each group into lines.
+
+    Layout boxes are visited in order. A span joins the first layout for which
+    calculate_overlap_area_in_bbox1_area_ratio(span bbox, layout bbox) exceeds
+    0.8, and is then removed from `spans` so later layouts cannot claim it.
+    Spans that match no layout stay in `spans` and appear in no returned line.
+
+    Args:
+        spans (list): span dicts with a 'bbox' entry; mutated in place.
+        layout_bboxes (list): dicts with a 'layout_bbox' entry.
+
+    Returns:
+        list: the result of line_sort_spans_by_left_to_right applied to the
+        lines that merge_spans_to_line produced for each non-empty group.
+    """
+    spans_per_layout = []
+    for layout in layout_bboxes:
+        region = layout['layout_bbox']
+        # Spans lying (mostly) inside this layout region.
+        members = [
+            span for span in spans
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], region) > 0.8
+        ]
+        if not members:
+            continue
+        spans_per_layout.append(members)
+        # Claimed spans are no longer available to the following layouts.
+        for member in members:
+            spans.remove(member)
+
+    lines = []
+    for members in spans_per_layout:
+        lines.extend(merge_spans_to_line(members))
+
+    # Order the spans inside every line and attach the line bbox.
+    return line_sort_spans_by_left_to_right(lines)
 
 
 

+ 17 - 0
magic_pdf/pre_proc/ocr_remove_spans.py

@@ -0,0 +1,17 @@
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
+
+
+def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
+    """
+    Drop every span that mostly lies inside one of the given boxes.
+
+    A span is removed when calculate_overlap_area_in_bbox1_area_ratio(span
+    bbox, box) is greater than 0.5 for at least one box in
+    need_remove_spans_bboxes. `spans` is edited in place and returned.
+    """
+    # Collect first, delete afterwards, so the list is not mutated while scanned.
+    doomed = [
+        span for span in spans
+        if any(
+            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], box) > 0.5
+            for box in need_remove_spans_bboxes
+        )
+    ]
+
+    for span in doomed:
+        spans.remove(span)
+
+    return spans