Explorar el Código

Adjust the y-axis bounds of inline equations to match the text on the same line
Convert falsely detected displayed equations to inline equations

liukaiwen hace 1 año
padre
commit
1f468bed0a
Se han modificado 2 ficheros con 20 adiciones y 12 borrados
  1. 6 2
      magic_pdf/pdf_parse_by_ocr.py
  2. 14 10
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 6 - 2
magic_pdf/pdf_parse_by_ocr.py

@@ -24,7 +24,8 @@ from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
     remove_overlaps_min_spans,
     merge_spans_to_line_by_layout,
-    modify_y_axis
+    modify_y_axis,
+    modify_inline_equation
 )
 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
@@ -184,8 +185,11 @@ def parse_pdf_by_ocr(
         spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
 
         # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
+        displayed_list = []
+        text_inline_lines = []
+        modify_y_axis(spans, displayed_list, text_inline_lines)
         # 模型识别错误的行间公式, type类型转换成行内公式
-        spans = modify_y_axis(spans)
+        spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
 
         # bbox去除粘连
         spans = remove_overlap_between_bbox(spans)

+ 14 - 10
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -94,12 +94,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
 
 
 
-def modify_y_axis(spans: list):
-    inline_list = []
-    displayed_list = []
-    text_list = []
-    image_list = []
-    table_list = []
+def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
+    # displayed_list = []
 
     spans.sort(key=lambda span: span['bbox'][1])
 
@@ -111,7 +107,7 @@ def modify_y_axis(spans: list):
     line_first_y0 = spans[0]["bbox"][1]
     line_first_y = spans[0]["bbox"][3]
     #用于给行间公式搜索
-    text_inline_lines = []
+    # text_inline_lines = []
     for span in spans[1:]:
         # if span.get("content","") == "78.":
         #     print("debug")
@@ -133,9 +129,8 @@ def modify_y_axis(spans: list):
 
         # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
         if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            if span["bbox"][1] < line_first_y0:
+            if span["type"] == "text":
                 line_first_y0 = span["bbox"][1]
-            if span["bbox"][3] > line_first_y:
                 line_first_y = span["bbox"][3]
             current_line.append(span)
 
@@ -164,6 +159,10 @@ def modify_y_axis(spans: list):
         for span in current_line:
             span["bbox"][1] = line_first_y0
             span["bbox"][3] = line_first_y
+
+    # return spans, displayed_list, text_inline_lines
+
+def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
     #错误行间公式转行内公式
     j = 0
     for i in range(len(displayed_list)):
@@ -180,7 +179,12 @@ def modify_y_axis(spans: list):
                 # span["bbox"][3] = y1
                 #调整公式类型
                 if span["type"] == "displayed_equation":
-                    span["type"] = "inline_equation"
+                    if j+1 >= len(text_inline_lines):
+                        span["type"] = "inline_equation"
+                    else:
+                        y0_next, y1_next = text_inline_lines[j + 1][1]
+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
+                            span["type"] = "inline_equation"
                 break
             elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                 break