Browse Source

移动modify_y_axis在pipeline中的位置

赵小蒙 1 year ago
parent
commit
6396910992

+ 1 - 4
magic_pdf/pdf_parse_by_ocr.py

@@ -177,9 +177,6 @@ def parse_pdf_by_ocr(
         # 删除重叠spans中较小的那些
         spans = remove_overlaps_min_spans(spans)
 
-        # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
-        spans = modify_y_axis(spans)
-
         # 删除remove_span_block_bboxes中的bbox
         spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
 
@@ -187,8 +184,8 @@ def parse_pdf_by_ocr(
         spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
 
         # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
-
         # 模型识别错误的行间公式, type类型转换成行内公式
+        spans = modify_y_axis(spans)
 
         # bbox去除粘连
         spans = remove_overlap_between_bbox(spans)

+ 1 - 1
magic_pdf/pre_proc/ocr_detect_layout.py

@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
     # 排序调整布局边界框列表
     new_bboxes = []
     for layout_bbox in layout_bboxes:
-        new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None,None])
+        new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None])
 
     layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
 

+ 5 - 5
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
     for span1 in spans.copy():
         for span2 in spans.copy():
             if span1 != span2:
-                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.5)
+                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
                 if overlap_box is not None:
                     bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                     if bbox_to_remove is not None:
@@ -113,8 +113,8 @@ def modify_y_axis(spans: list):
     #用于给行间公式搜索
     text_inline_lines = []
     for span in spans[1:]:
-        if span.get("content","") == "78.":
-            print("debug")
+        # if span.get("content","") == "78.":
+        #     print("debug")
         # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
         # image和table类型,同上
         if span['type'] in ["displayed_equation", "image", "table"] or any(
@@ -167,8 +167,8 @@ def modify_y_axis(spans: list):
     #错误行间公式转行内公式
     j = 0
     for i in range(len(displayed_list)):
-        if i == 8:
-            print("debug")
+        # if i == 8:
+        #     print("debug")
         span = displayed_list[i]
         span_y0, span_y = span["bbox"][1], span["bbox"][3]