Преглед на файлове

基于block内对spans做一些fix

赵小蒙 преди 1 година
родител
ревизия
54f165aac6
променен е 1 файл, в който са добавени 14 реда и са изтрити 0 реда
  1. 14 0
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 14 - 0
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -5,6 +5,8 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
+from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 
 # 将每一个line中的span从左到右排序
@@ -157,6 +159,18 @@ def fill_spans_in_blocks(blocks, spans):
             span_bbox = span['bbox']
             if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.8:
                 block_spans.append(span)
+
+        '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
+        displayed_list = []
+        text_inline_lines = []
+        modify_y_axis(block_spans, displayed_list, text_inline_lines)
+
+        '''模型识别错误的行间公式, type类型转换成行内公式'''
+        block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
+
+        '''bbox去除粘连'''
+        block_spans = remove_overlap_between_bbox(block_spans)
+
         block_dict['spans'] = block_spans
         block_with_spans.append(block_dict)