瀏覽代碼

fix text_block overlap bug

赵小蒙 1 年之前
父節點
當前提交
ffdfe37b00
共有 1 個文件被更改,包括 2 次插入和 1 次删除
  1. +2 -1
      magic_pdf/pdf_parse_union_core.py

+ 2 - 1
magic_pdf/pdf_parse_union_core.py

@@ -20,6 +20,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
     fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
 
 
@@ -97,7 +98,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     img_blocks = magic_model.get_imgs(page_id)
     table_blocks = magic_model.get_tables(page_id)
     discarded_blocks = magic_model.get_discarded(page_id)
-    text_blocks = magic_model.get_text_blocks(page_id)
+    text_blocks = remove_overlap_between_bbox(magic_model.get_text_blocks(page_id))
     title_blocks = magic_model.get_title_blocks(page_id)
     inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)