|
|
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
|
|
|
return new_blocks
|
|
|
|
|
|
|
|
|
-def remove_outside_spans(spans, all_bboxes):
|
|
|
- image_bboxes = []
|
|
|
- table_bboxes = []
|
|
|
- other_block_bboxes = []
|
|
|
- for block in all_bboxes:
|
|
|
- block_type = block[7]
|
|
|
- block_bbox = block[0:4]
|
|
|
-
|
|
|
- if block_type == BlockType.ImageBody:
|
|
|
- image_bboxes.append(block_bbox)
|
|
|
- elif block_type == BlockType.TableBody:
|
|
|
- table_bboxes.append(block_bbox)
|
|
|
- else:
|
|
|
- other_block_bboxes.append(block_bbox)
|
|
|
+def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
|
|
|
+ def get_block_bboxes(blocks, block_type_list):
|
|
|
+ return [block[0:4] for block in blocks if block[7] in block_type_list]
|
|
|
+
|
|
|
+ image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
|
|
|
+ table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
|
|
|
+ other_block_type = []
|
|
|
+ for block_type in BlockType.__dict__.values():
|
|
|
+ if not isinstance(block_type, str):
|
|
|
+ continue
|
|
|
+ if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
|
|
|
+ other_block_type.append(block_type)
|
|
|
+ other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
|
|
|
+ discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])
|
|
|
|
|
|
new_spans = []
|
|
|
|
|
|
for span in spans:
|
|
|
- if span['type'] == ContentType.Image:
|
|
|
- for block_bbox in image_bboxes:
|
|
|
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
|
|
|
- new_spans.append(span)
|
|
|
- break
|
|
|
- elif span['type'] == ContentType.Table:
|
|
|
- for block_bbox in table_bboxes:
|
|
|
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
|
|
|
- new_spans.append(span)
|
|
|
- break
|
|
|
+ span_bbox = span['bbox']
|
|
|
+ span_type = span['type']
|
|
|
+
|
|
|
+ if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
|
|
|
+ discarded_block_bboxes):
|
|
|
+ new_spans.append(span)
|
|
|
+ continue
|
|
|
+
|
|
|
+ if span_type == ContentType.Image:
|
|
|
+ if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
|
|
|
+ image_bboxes):
|
|
|
+ new_spans.append(span)
|
|
|
+ elif span_type == ContentType.Table:
|
|
|
+ if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
|
|
|
+ table_bboxes):
|
|
|
+ new_spans.append(span)
|
|
|
else:
|
|
|
- for block_bbox in other_block_bboxes:
|
|
|
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
|
|
|
- new_spans.append(span)
|
|
|
- break
|
|
|
+ if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
|
|
|
+ other_block_bboxes):
|
|
|
+ new_spans.append(span)
|
|
|
|
|
|
return new_spans
|
|
|
|
|
|
@@ -488,7 +493,8 @@ def parse_page_core(
|
|
|
raise Exception('parse_mode must be txt or ocr')
|
|
|
|
|
|
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
|
|
|
- spans = remove_outside_spans(spans, all_bboxes)
|
|
|
+ """顺便删除大水印并保留abandon的span"""
|
|
|
+ spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
|
|
|
|
|
|
"""删除重叠spans中置信度较低的那些"""
|
|
|
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|