|
|
@@ -4,7 +4,8 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
|
|
|
calculate_overlap_area_in_bbox1_area_ratio
|
|
|
from magic_pdf.libs.drop_tag import DropTag
|
|
|
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
|
|
-from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
|
|
|
+from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
|
|
|
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
|
|
|
|
|
|
|
|
|
# 将每一个line中的span从左到右排序
|
|
|
@@ -157,6 +158,18 @@ def fill_spans_in_blocks(blocks, spans):
|
|
|
span_bbox = span['bbox']
|
|
|
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.8:
|
|
|
block_spans.append(span)
|
|
|
+
|
|
|
+ '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
|
|
|
+ displayed_list = []
|
|
|
+ text_inline_lines = []
|
|
|
+ modify_y_axis(block_spans, displayed_list, text_inline_lines)
|
|
|
+
|
|
|
+ '''模型识别错误的行间公式, type类型转换成行内公式'''
|
|
|
+ block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
|
|
|
+
|
|
|
+ '''bbox去除粘连'''
|
|
|
+ block_spans = remove_overlap_between_bbox(block_spans)
|
|
|
+
|
|
|
block_dict['spans'] = block_spans
|
|
|
block_with_spans.append(block_dict)
|
|
|
|
|
|
@@ -189,3 +202,111 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
|
|
|
continue
|
|
|
fix_blocks.append(block)
|
|
|
return fix_blocks
|
|
|
+
|
|
|
+
|
|
|
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a line-structured block from the spans that belong to a bbox.

    A span is selected when
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox)``
    is greater than 0.8. The selected spans are grouped with
    ``merge_spans_to_line`` and the result is passed through
    ``line_sort_spans_by_left_to_right``.

    Args:
        spans: Candidate span dicts; each must have a ``'bbox'`` entry.
        block_bbox: Bounding box stored on the new block and used for the
            overlap test.
        block_type: Value stored under ``'block_type'`` on the new block.

    Returns:
        A ``(block, selected_spans)`` tuple, where ``block`` is a dict with
        the keys ``'bbox'``, ``'block_type'`` and ``'lines'`` and
        ``selected_spans`` lists the spans that passed the overlap test, in
        their original order.
    """
    # Keep only the spans whose overlap ratio against block_bbox exceeds 0.8.
    selected = [
        candidate
        for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.8
    ]
    # Group the selected spans into lines, then order the spans of each line.
    ordered_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(selected))
    return {'bbox': block_bbox, 'block_type': block_type, 'lines': ordered_lines}, selected
|
|
|
+
|
|
|
+
|
|
|
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Wrap a single span in a one-line block.

    Args:
        span: The span placed, by reference, as the only entry of the
            block's single line.
        block_bbox: Bounding box stored on both the line and the block
            (the same object is used for both).
        block_type: Value stored under ``'block_type'``.

    Returns:
        A dict with the keys ``'bbox'``, ``'block_type'`` and ``'lines'``,
        where ``'lines'`` holds exactly one line dict with the keys
        ``'bbox'`` and ``'spans'``.
    """
    return {
        'bbox': block_bbox,
        'block_type': block_type,
        # The body consists of exactly one line that carries the given span.
        'lines': [{'bbox': block_bbox, 'spans': [span]}],
    }
|
|
|
+
|
|
|
+
|
|
|
def fix_image_block(block, img_blocks):
    """Replace an image block's flat span list with nested sub-blocks.

    The first entry of ``img_blocks`` whose ``'bbox'`` equals
    ``block['bbox']`` is used. For that entry:

    * the first span of type ``ContentType.Image`` whose bbox equals the
      entry's ``'img_body_bbox'`` becomes an ``ImageBody`` sub-block and is
      removed from ``block['spans']``;
    * if ``'img_caption_bbox'`` is not ``None``, the remaining spans are
      passed to ``merge_spans_to_block`` to build an ``ImageCaption``
      sub-block.

    Args:
        block: Block dict with ``'bbox'`` and ``'spans'``; modified in place.
        img_blocks: Iterable of dicts with ``'bbox'``, ``'img_body_bbox'``
            and ``'img_caption_bbox'``.

    Returns:
        The same ``block`` object, now carrying ``'blocks'`` (possibly empty
        when nothing matched) and no longer carrying ``'spans'``.
    """
    children = block['blocks'] = []

    # Only the first layout entry with an identical bbox is considered.
    matched = next(
        (entry for entry in img_blocks if entry['bbox'] == block['bbox']),
        None,
    )
    if matched is not None:
        remaining = block['spans']

        # Pull out the single span that is the image body itself.
        for item in remaining:
            if item['type'] == ContentType.Image and item['bbox'] == matched['img_body_bbox']:
                children.append(
                    make_body_block(item, matched['img_body_bbox'], BlockType.ImageBody)
                )
                # Safe despite iterating: the loop is left immediately after.
                remaining.remove(item)
                break

        # A caption sub-block is built only when a caption bbox is present.
        caption_bbox = matched['img_caption_bbox']
        if caption_bbox is not None:
            caption_block, _ = merge_spans_to_block(
                remaining, caption_bbox, BlockType.ImageCaption
            )
            children.append(caption_block)

    # The flat span list is dropped once the sub-blocks have been built.
    del block['spans']
    return block
|
|
|
+
|
|
|
+
|
|
|
def fix_table_block(block, table_blocks):
    """Replace a table block's flat span list with nested sub-blocks.

    The first entry of ``table_blocks`` whose ``'bbox'`` equals
    ``block['bbox']`` is used. For that entry, in order:

    * the first span of type ``ContentType.Table`` whose bbox equals the
      entry's ``'table_body_bbox'`` becomes a ``TableBody`` sub-block and is
      removed from ``block['spans']``;
    * if ``'table_caption_bbox'`` is not ``None``, a ``TableCaption``
      sub-block is built with ``merge_spans_to_block`` and the spans it
      claimed are removed from ``block['spans']``;
    * if ``'table_footnote_bbox'`` is not ``None``, a ``TableFootnote``
      sub-block is built from whatever spans remain.

    Args:
        block: Block dict with ``'bbox'`` and ``'spans'``; modified in place.
        table_blocks: Iterable of dicts with ``'bbox'``,
            ``'table_body_bbox'``, ``'table_caption_bbox'`` and
            ``'table_footnote_bbox'``.

    Returns:
        The same ``block`` object, now carrying ``'blocks'`` (possibly empty
        when nothing matched) and no longer carrying ``'spans'``.
    """
    children = block['blocks'] = []

    # Only the first layout entry with an identical bbox is considered.
    matched = next(
        (entry for entry in table_blocks if entry['bbox'] == block['bbox']),
        None,
    )
    if matched is not None:
        remaining = block['spans']

        # Pull out the single span that is the table body itself.
        for item in remaining:
            if item['type'] == ContentType.Table and item['bbox'] == matched['table_body_bbox']:
                children.append(
                    make_body_block(item, matched['table_body_bbox'], BlockType.TableBody)
                )
                # Safe despite iterating: the loop is left immediately after.
                remaining.remove(item)
                break

        # Caption: build the sub-block, then take its spans out of the pool so
        # the footnote step below cannot pick them up again.
        caption_bbox = matched['table_caption_bbox']
        if caption_bbox is not None:
            caption_block, caption_spans = merge_spans_to_block(
                remaining, caption_bbox, BlockType.TableCaption
            )
            children.append(caption_block)
            for claimed in caption_spans:
                remaining.remove(claimed)

        # Footnote: built from the spans that are still unclaimed.
        footnote_bbox = matched['table_footnote_bbox']
        if footnote_bbox is not None:
            footnote_block, _ = merge_spans_to_block(
                remaining, footnote_bbox, BlockType.TableFootnote
            )
            children.append(footnote_block)

    # The flat span list is dropped once the sub-blocks have been built.
    del block['spans']
    return block
|
|
|
+
|
|
|
+
|
|
|
def fix_text_block(block):
    """Convert a text block's flat span list into sorted lines.

    ``block['spans']`` is grouped with ``merge_spans_to_line``, the result
    is passed through ``line_sort_spans_by_left_to_right`` and stored under
    ``block['lines']``; the ``'spans'`` key is then removed.

    Args:
        block: Block dict with a ``'spans'`` entry; modified in place.

    Returns:
        The same ``block`` object.
    """
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block['spans'])
    )
    # Spans now live inside the lines, so the flat list is no longer kept.
    del block['spans']
    return block
|