Sfoglia il codice sorgente

add todo about interline_equation

赵小蒙 1 anno fa
parent
commit
e92de75844

+ 1 - 0
magic_pdf/pdf_parse_union_core.py

@@ -111,6 +111,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
 
     '''将所有区块的bbox整理到一起'''
+    # @todo interline_equation_blocks参数不够准,后面切换到interline_equations上
     if len(interline_equation_blocks) > 0:
         all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,

+ 3 - 0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -36,6 +36,9 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
     '''任何框体与舍弃框重叠,优先信任舍弃框'''
     all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+    # @todo interline_equation 与title或text框冲突的情况,分两种情况处理
+    '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
+    '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
 
     '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
     for discarded in discarded_blocks: