| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- from magic_pdf.config.ocr_content_type import BlockType, ContentType
- from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
- # 将每一个line中的span从左到右排序
- def line_sort_spans_by_left_to_right(lines):
- line_objects = []
- for line in lines:
- # 按照x0坐标排序
- line.sort(key=lambda span: span['bbox'][0])
- line_bbox = [
- min(span['bbox'][0] for span in line), # x0
- min(span['bbox'][1] for span in line), # y0
- max(span['bbox'][2] for span in line), # x1
- max(span['bbox'][3] for span in line), # y1
- ]
- line_objects.append({
- 'bbox': line_bbox,
- 'spans': line,
- })
- return line_objects
- def merge_spans_to_line(spans, threshold=0.6):
- if len(spans) == 0:
- return []
- else:
- # 按照y0坐标排序
- spans.sort(key=lambda span: span['bbox'][1])
- lines = []
- current_line = [spans[0]]
- for span in spans[1:]:
- # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
- # image和table类型,同上
- if span['type'] in [
- ContentType.InterlineEquation, ContentType.Image,
- ContentType.Table
- ] or any(s['type'] in [
- ContentType.InterlineEquation, ContentType.Image,
- ContentType.Table
- ] for s in current_line):
- # 则开始新行
- lines.append(current_line)
- current_line = [span]
- continue
- # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
- if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
- current_line.append(span)
- else:
- # 否则,开始新行
- lines.append(current_line)
- current_line = [span]
- # 添加最后一行
- if current_line:
- lines.append(current_line)
- return lines
- def span_block_type_compatible(span_type, block_type):
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
- return block_type in [
- BlockType.Text,
- BlockType.Title,
- BlockType.ImageCaption,
- BlockType.ImageFootnote,
- BlockType.TableCaption,
- BlockType.TableFootnote,
- BlockType.Discarded
- ]
- elif span_type == ContentType.InterlineEquation:
- return block_type in [BlockType.InterlineEquation, BlockType.Text]
- elif span_type == ContentType.Image:
- return block_type in [BlockType.ImageBody]
- elif span_type == ContentType.Table:
- return block_type in [BlockType.TableBody]
- else:
- return False
- def fill_spans_in_blocks(blocks, spans, radio):
- """将allspans中的span按位置关系,放入blocks中."""
- block_with_spans = []
- for block in blocks:
- block_type = block[7]
- block_bbox = block[0:4]
- block_dict = {
- 'type': block_type,
- 'bbox': block_bbox,
- }
- if block_type in [
- BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
- BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
- ]:
- block_dict['group_id'] = block[-1]
- block_spans = []
- for span in spans:
- span_bbox = span['bbox']
- if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
- block_spans.append(span)
- block_dict['spans'] = block_spans
- block_with_spans.append(block_dict)
- # 从spans删除已经放入block_spans中的span
- if len(block_spans) > 0:
- for span in block_spans:
- spans.remove(span)
- return block_with_spans, spans
- def fix_block_spans_v2(block_with_spans):
- fix_blocks = []
- for block in block_with_spans:
- block_type = block['type']
- if block_type in [BlockType.Text, BlockType.Title,
- BlockType.ImageCaption, BlockType.ImageFootnote,
- BlockType.TableCaption, BlockType.TableFootnote
- ]:
- block = fix_text_block(block)
- elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
- block = fix_interline_block(block)
- else:
- continue
- fix_blocks.append(block)
- return fix_blocks
- def fix_discarded_block(discarded_block_with_spans):
- fix_discarded_blocks = []
- for block in discarded_block_with_spans:
- block = fix_text_block(block)
- fix_discarded_blocks.append(block)
- return fix_discarded_blocks
- def fix_text_block(block):
- # 文本block中的公式span都应该转换成行内type
- for span in block['spans']:
- if span['type'] == ContentType.InterlineEquation:
- span['type'] = ContentType.InlineEquation
- block_lines = merge_spans_to_line(block['spans'])
- sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
- block['lines'] = sort_block_lines
- del block['spans']
- return block
- def fix_interline_block(block):
- block_lines = merge_spans_to_line(block['spans'])
- sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
- block['lines'] = sort_block_lines
- del block['spans']
- return block
|