Browse Source

Merge pull request #4 from myhloli/dev-in-line-bbox

Dev in line bbox
myhloli 1 year ago
parent
commit
f9310954bd

+ 5 - 3
magic_pdf/pdf_parse_by_ocr.py

@@ -23,10 +23,9 @@ from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
     merge_spans_to_line_by_layout,
-    modify_y_axis
 )
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
-    adjust_bbox_for_standalone_block
+    adjust_bbox_for_standalone_block,modify_y_axis,modify_inline_equation
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 
@@ -184,8 +183,11 @@ def parse_pdf_by_ocr(
         spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
 
         # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
+        displayed_list = []
+        text_inline_lines = []
+        modify_y_axis(spans, displayed_list, text_inline_lines)
         # 模型识别错误的行间公式, type类型转换成行内公式
-        spans = modify_y_axis(spans)
+        spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
 
         # bbox去除粘连
         spans = remove_overlap_between_bbox(spans)

+ 0 - 95
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -81,101 +81,6 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
 
 
 
-def modify_y_axis(spans: list):
-    inline_list = []
-    displayed_list = []
-    text_list = []
-    image_list = []
-    table_list = []
-
-    spans.sort(key=lambda span: span['bbox'][1])
-
-    lines = []
-    current_line = [spans[0]]
-    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
-        displayed_list.append(spans[0])
-
-    line_first_y0 = spans[0]["bbox"][1]
-    line_first_y = spans[0]["bbox"][3]
-    #用于给行间公式搜索
-    text_inline_lines = []
-    for span in spans[1:]:
-        # if span.get("content","") == "78.":
-        #     print("debug")
-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
-        # image和table类型,同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(
-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
-            #传入
-            if span["type"] in ["displayed_equation", "image", "table"]:
-                displayed_list.append(span)
-            # 则开始新行
-            lines.append(current_line)
-            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
-                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-            continue
-
-        # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            if span["bbox"][1] < line_first_y0:
-                line_first_y0 = span["bbox"][1]
-            if span["bbox"][3] > line_first_y:
-                line_first_y = span["bbox"][3]
-            current_line.append(span)
-
-        else:
-            # 否则,开始新行
-            lines.append(current_line)
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-
-        # 添加最后一行
-    if current_line:
-        lines.append(current_line)
-        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-    for line in text_inline_lines:
-        # 按照x0坐标排序
-        current_line = line[0]
-        current_line.sort(key=lambda span: span['bbox'][0])
-
-
-    #调整每一个文字行内bbox统一
-    for line in text_inline_lines:
-        current_line, (line_first_y0, line_first_y) = line
-        for span in current_line:
-            span["bbox"][1] = line_first_y0
-            span["bbox"][3] = line_first_y
-    #错误行间公式转行内公式
-    j = 0
-    for i in range(len(displayed_list)):
-        # if i == 8:
-        #     print("debug")
-        span = displayed_list[i]
-        span_y0, span_y = span["bbox"][1], span["bbox"][3]
-
-        while j < len(text_inline_lines):
-            text_line = text_inline_lines[j]
-            y0, y1 = text_line[1]
-            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
-                span["bbox"][1] = y0
-                # span["bbox"][3] = y1
-                #调整公式类型
-                if span["type"] == "displayed_equation":
-                    span["type"] = "inline_equation"
-                break
-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
-                break
-            else:
-                j += 1
-
-    return spans
-
 
 
 

+ 101 - 0
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -43,3 +43,104 @@ def adjust_bbox_for_standalone_block(spans):
                             # 调整span的y0和span2的y0一致
                             sb_span['bbox'][1] = text_span['bbox'][1]
     return spans
+
+
+
+def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
+    # Groups spans into lines by y-overlap, fills the caller's displayed_list / text_inline_lines, and unifies y0/y1 inside each text line.
+
+    spans.sort(key=lambda span: span['bbox'][1])
+
+    lines = []
+    current_line = [spans[0]]
+    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
+        displayed_list.append(spans[0])
+
+    line_first_y0 = spans[0]["bbox"][1]
+    line_first_y = spans[0]["bbox"][3]
+    # text_inline_lines collects (line, (y0, y1)) tuples that are later searched when matching displayed equations
+    # NOTE: spans[0] is read above, so an empty spans list raises IndexError
+    for span in spans[1:]:
+        # if span.get("content","") == "78.":
+        #     print("debug")
+        # If the current span is a "displayed_equation", or the current line already contains a "displayed_equation"
+        # (image and table types are treated the same way), the span cannot join the current line
+        if span['type'] in ["displayed_equation", "image", "table"] or any(
+                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+            # record block-level spans in the caller-supplied list
+            if span["type"] in ["displayed_equation", "image", "table"]:
+                displayed_list.append(span)
+            # then close the current line and start a new one
+            lines.append(current_line)
+            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+            current_line = [span]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
+            continue
+
+        # If the span passes the y-overlap check against the last span of the current line, add it to that line; a text span also resets the line's y-range to its own
+        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+            if span["type"] == "text":
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+            current_line.append(span)
+
+        else:
+            # otherwise, start a new line
+            lines.append(current_line)
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+            current_line = [span]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
+
+        # append the final line
+    if current_line:
+        lines.append(current_line)
+        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+    for line in text_inline_lines:
+        # sort the spans of each line by their x0 coordinate
+        current_line = line[0]
+        current_line.sort(key=lambda span: span['bbox'][0])
+
+
+    # give every span in a text line the same y0/y1 as the line itself
+    for line in text_inline_lines:
+        current_line, (line_first_y0, line_first_y) = line
+        for span in current_line:
+            span["bbox"][1] = line_first_y0
+            span["bbox"][3] = line_first_y
+
+    # no return value: spans, displayed_list and text_inline_lines are all modified in place
+
+def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
+    # Retype displayed equations that actually sit on a text line as inline equations; entries are mutated in place and spans is returned.
+    j = 0
+    for i in range(len(displayed_list)):
+        # j is not reset per span: both lists are walked in a single forward pass
+        # (presumably both are ordered top-to-bottom by the earlier y-sort; confirm against the caller)
+        span = displayed_list[i]
+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+
+        while j < len(text_inline_lines):
+            text_line = text_inline_lines[j]
+            y0, y1 = text_line[1]
+            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                span["bbox"][1] = y0
+                # only the top edge is snapped to the line; the bottom edge (bbox[3]) is left unchanged
+                # change the equation type, but only when the span does not also overlap the next text line
+                if span["type"] == "displayed_equation":
+                    if j+1 >= len(text_inline_lines):
+                        span["type"] = "inline_equation"
+                    else:
+                        y0_next, y1_next = text_inline_lines[j + 1][1]
+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
+                            span["type"] = "inline_equation"
+                break
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                break
+            else:
+                j += 1
+
+    return spans