Jelajahi Sumber

Merge pull request #3 from myhloli/dev-in-line-bbox

in line equation bbox modification
myhloli 1 tahun lalu
induk
melakukan
61405b8af8
2 mengubah file dengan 105 tambahan dan 0 penghapusan
  1. 4 0
      magic_pdf/pdf_parse_by_ocr.py
  2. 101 0
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 4 - 0
magic_pdf/pdf_parse_by_ocr.py

@@ -24,6 +24,7 @@ from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
     remove_overlaps_min_spans,
     merge_spans_to_line_by_layout,
+    modify_y_axis
 )
 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
@@ -176,6 +177,9 @@ def parse_pdf_by_ocr(
         # 删除重叠spans中较小的那些
         spans = remove_overlaps_min_spans(spans)
 
+        # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
+        spans = modify_y_axis(spans)
+
         # 删除remove_span_block_bboxes中的bbox
         spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
 

+ 101 - 0
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -91,3 +91,104 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
     lines = line_sort_spans_by_left_to_right(lines)
 
     return lines
+
+
+
+def modify_y_axis(spans: list):
+    inline_list = []
+    displayed_list = []
+    text_list = []
+    image_list = []
+    table_list = []
+
+    spans.sort(key=lambda span: span['bbox'][1])
+
+    lines = []
+    current_line = [spans[0]]
+    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
+        displayed_list.append(spans[0])
+
+    line_first_y0 = spans[0]["bbox"][1]
+    line_first_y = spans[0]["bbox"][3]
+    #用于给行间公式搜索
+    text_inline_lines = []
+    for span in spans[1:]:
+        if span.get("content","") == "78.":
+            print("debug")
+        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
+        # image和table类型,同上
+        if span['type'] in ["displayed_equation", "image", "table"] or any(
+                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+            #传入
+            if span["type"] in ["displayed_equation", "image", "table"]:
+                displayed_list.append(span)
+            # 则开始新行
+            lines.append(current_line)
+            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+            current_line = [span]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
+            continue
+
+        # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
+        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+            if span["bbox"][1] < line_first_y0:
+                line_first_y0 = span["bbox"][1]
+            if span["bbox"][3] > line_first_y:
+                line_first_y = span["bbox"][3]
+            current_line.append(span)
+
+        else:
+            # 否则,开始新行
+            lines.append(current_line)
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+            current_line = [span]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
+
+        # 添加最后一行
+    if current_line:
+        lines.append(current_line)
+        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+    for line in text_inline_lines:
+        # 按照x0坐标排序
+        current_line = line[0]
+        current_line.sort(key=lambda span: span['bbox'][0])
+
+
+    #调整每一个文字行内bbox统一
+    for line in text_inline_lines:
+        current_line, (line_first_y0, line_first_y) = line
+        for span in current_line:
+            span["bbox"][1] = line_first_y0
+            span["bbox"][3] = line_first_y
+    #错误行间公式转行内公式
+    j = 0
+    for i in range(len(displayed_list)):
+        if i == 8:
+            print("debug")
+        span = displayed_list[i]
+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+
+        while j < len(text_inline_lines):
+            text_line = text_inline_lines[j]
+            y0, y1 = text_line[1]
+            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                span["bbox"][1] = y0
+                # span["bbox"][3] = y1
+                #调整公式类型
+                if span["type"] == "displayed_equation":
+                    span["type"] = "inline_equation"
+                break
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                break
+            else:
+                j += 1
+
+    return spans
+
+
+
+