Jelajahi Sumber

span->line现基于模型的layout进行拼接

赵小蒙 1 tahun lalu
induk
melakukan
864e95355f

+ 1 - 1
demo/ocr_demo.py

@@ -57,4 +57,4 @@ if __name__ == '__main__':
         # logger.info(markdown_content)
         # save_markdown(markdown_text, ocr_json_file_path)
     except Exception as e:
-        logger.error(e)
+        logger.exception(e)

+ 3 - 3
magic_pdf/pdf_parse_by_ocr.py

@@ -11,7 +11,7 @@ from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
 from magic_pdf.pre_proc.detect_header import parse_headers
 from magic_pdf.pre_proc.detect_page_number import parse_pageNos
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
-from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
+from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
 
 
@@ -151,10 +151,10 @@ def parse_pdf_by_ocr(
         # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
 
         # 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
-        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page)
+        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
 
         # 将spans合并成line(在layout内,从上到下,从左到右)
-        lines = merge_spans_to_line(spans, layout_bboxes)
+        lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
         # logger.info(lines)
 
         # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox

+ 2 - 2
magic_pdf/pre_proc/ocr_detect_layout.py

@@ -66,7 +66,7 @@ def adjust_layouts(layout_bboxes):
     return layout_bboxes
 
 
-def layout_detect(layout_info, page: fitz.Page):
+def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
     """
     对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
 
@@ -77,7 +77,7 @@ def layout_detect(layout_info, page: fitz.Page):
         list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
 
     """
-    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(layout_info, page)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
     # 初始化布局边界框列表
     layout_bboxes = []
     # 遍历每个子布局

+ 45 - 18
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -1,4 +1,7 @@
-from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
+from loguru import logger
+
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
+    calculate_overlap_area_in_bbox1_area_ratio
 
 
 # 删除重叠spans中较小的那些
@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
     return spans
 
 
+# Sort the spans of every line from left to right and wrap each line in a line object.
+def line_sort_spans_by_left_to_right(lines):
+    """
+    Turn raw lines (lists of spans) into line objects.
+
+    Each input line is sorted in place by the x0 coordinate of its spans
+    (``span['bbox'][0]``). The returned object for a line holds the union
+    bounding box of its spans under "bbox" and the same, now sorted, list
+    under "spans".
+
+    Args:
+        lines: list of lines, each line being a list of span dicts that
+            carry a 'bbox' entry indexed as [x0, y0, x1, y1].
+
+    Returns:
+        list: one {"bbox": [...], "spans": [...]} dict per input line, in input order.
+    """
+
+    def _to_line_object(line_spans):
+        # In-place sort: the caller's list is reordered and reused as "spans".
+        line_spans.sort(key=lambda item: item['bbox'][0])
+        boxes = [item['bbox'] for item in line_spans]
+        # Union box: smallest x0/y0 and largest x1/y1 over all spans of the line.
+        union_bbox = [
+            min(box[0] for box in boxes),
+            min(box[1] for box in boxes),
+            max(box[2] for box in boxes),
+            max(box[3] for box in boxes),
+        ]
+        return {"bbox": union_bbox, "spans": line_spans}
+
+    return [_to_line_object(line_spans) for line_spans in lines]
+
 def merge_spans_to_line(spans):
     # 按照y0坐标排序
     spans.sort(key=lambda span: span['bbox'][1])
@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
     for span in spans[1:]:
         # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
         # image和table类型,同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+        if span['type'] in ["displayed_equation", "image", "table"] or any(
+                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
             # 则开始新行
             lines.append(current_line)
             current_line = [span]
@@ -41,20 +63,25 @@ def merge_spans_to_line(spans):
     if current_line:
         lines.append(current_line)
 
-    # 计算每行的边界框,并对每行中的span按照x0进行排序
-    line_objects = []
-    for line in lines:
-        # 按照x0坐标排序
-        line.sort(key=lambda span: span['bbox'][0])
-        line_bbox = [
-            min(span['bbox'][0] for span in line),  # x0
-            min(span['bbox'][1] for span in line),  # y0
-            max(span['bbox'][2] for span in line),  # x1
-            max(span['bbox'][3] for span in line),  # y1
-        ]
-        line_objects.append({
-            "bbox": line_bbox,
-            "spans": line,
-        })
+    return lines
 
-    return line_objects
+def merge_spans_to_line_by_layout(spans, layout_bboxes):
+    """
+    Merge spans into lines layout by layout, following the order of layout_bboxes.
+
+    Spans are first grouped per layout box, each non-empty group is merged
+    into lines with merge_spans_to_line, and finally every line is converted
+    into a line object by line_sort_spans_by_left_to_right.
+
+    Spans that do not pass the overlap test for any layout box are not
+    included in the result.
+
+    Args:
+        spans: list of span dicts, each with a 'bbox' entry.
+        layout_bboxes: list of dicts, each with a 'layout_bbox' entry.
+
+    Returns:
+        list: the line objects produced by line_sort_spans_by_left_to_right,
+        ordered by layout and, within a layout, in the order returned by
+        merge_spans_to_line.
+    """
+    # Group the spans per layout box, keeping the order of layout_bboxes.
+    spans_per_layout = []
+    for item in layout_bboxes:
+        layout_bbox = item['layout_bbox']
+        # A span is assigned to this layout when the overlap ratio exceeds 0.8
+        # (presumably the share of the span's own area covered by the layout
+        # box -- confirm against calculate_overlap_area_in_bbox1_area_ratio).
+        layout_spans = [
+            span for span in spans
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8
+        ]
+        spans_per_layout.append(layout_spans)
+
+    lines = []
+    for layout_spans in spans_per_layout:
+        # A layout box may match no spans at all. merge_spans_to_line appears
+        # to seed its first line from spans[0] (that part is not visible here),
+        # so an empty group is skipped instead of being passed on; an empty
+        # group cannot contribute any line either way.
+        if not layout_spans:
+            continue
+        lines.extend(merge_spans_to_line(layout_spans))
+
+    # Sort the spans inside each line left to right and attach the line bbox.
+    return line_sort_spans_by_left_to_right(lines)