| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- # Copyright (c) Opendatalab. All rights reserved.
- import re
- import cv2
- import numpy as np
- from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio, calculate_iou, \
- get_minbox_if_overlap_by_ratio
- from mineru.utils.enum_class import BlockType, ContentType
- from mineru.utils.pdf_image_tools import get_crop_img
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Filter spans down to those that overlap a layout block of a matching kind.

    The overlap measure is whatever
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    returns. A span is kept when either

    * that value exceeds 0.4 for at least one discarded block, or
    * that value exceeds 0.5 for at least one block from the group that
      corresponds to the span type: image-body blocks for image spans,
      table-body blocks for table spans, and all remaining block types
      for every other span.

    :param spans: span dicts; the 'bbox' and 'type' keys are read
    :param all_bboxes: block records; items 0-3 are used as the bbox and
        item 7 as the block type
    :param all_discarded_blocks: block records in the same layout; only
        entries typed ``BlockType.DISCARDED`` are considered
    :return: a new list with the surviving spans, in their original order
    """

    def bboxes_of(blocks, wanted_types):
        # Project the first four items (the bbox) of every block of a wanted type.
        return [blk[0:4] for blk in blocks if blk[7] in wanted_types]

    def overlaps_any(bbox, candidates, threshold):
        # True as soon as one candidate block exceeds the overlap threshold.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(bbox, candidate) > threshold
            for candidate in candidates
        )

    image_bboxes = bboxes_of(all_bboxes, [BlockType.IMAGE_BODY])
    table_bboxes = bboxes_of(all_bboxes, [BlockType.TABLE_BODY])

    # Every string-valued attribute of BlockType other than the two body types.
    body_types = [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]
    other_types = [
        value
        for value in BlockType.__dict__.values()
        if isinstance(value, str) and value not in body_types
    ]
    other_bboxes = bboxes_of(all_bboxes, other_types)
    discarded_bboxes = bboxes_of(all_discarded_blocks, [BlockType.DISCARDED])

    kept = []
    for span in spans:
        bbox = span['bbox']
        kind = span['type']

        # Spans lying in a discarded block are kept regardless of their type.
        if overlaps_any(bbox, discarded_bboxes, 0.4):
            kept.append(span)
            continue

        if kind == ContentType.IMAGE:
            candidates = image_bboxes
        elif kind == ContentType.TABLE:
            candidates = table_bboxes
        else:
            candidates = other_bboxes

        if overlaps_any(bbox, candidates, 0.5):
            kept.append(span)

    return kept
def remove_overlaps_low_confidence_spans(spans):
    """Among heavily overlapping spans, drop the one with the lower score.

    Every ordered pair of unequal spans is examined. When ``calculate_iou``
    of their bboxes exceeds 0.9, the span with the smaller 'score' is marked
    for removal; if the scores are not strictly ordered that way, the second
    span of the pair is marked. A span that is already marked takes no part
    in any later comparison.

    :param spans: list of span dicts ('bbox' and 'score' are read); the list
        is modified in place
    :return: tuple ``(spans, dropped_spans)`` - the same list object after
        removal, and the list of spans that were removed
    """
    dropped_spans = []

    for first in spans:
        for second in spans:
            # Compare only distinct spans, neither of which is already dropped.
            if first != second and first not in dropped_spans and second not in dropped_spans:
                if calculate_iou(first['bbox'], second['bbox']) > 0.9:
                    loser = first if first['score'] < second['score'] else second
                    if loser not in dropped_spans:
                        dropped_spans.append(loser)

    for loser in dropped_spans:
        spans.remove(loser)

    return spans, dropped_spans
def remove_overlaps_min_spans(spans):
    """Among overlapping spans, drop the one selected by the min-box helper.

    Every ordered pair of unequal spans is passed to
    ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.65)``. When that helper
    returns a box, the first span in ``spans`` whose 'bbox' equals the
    returned box is marked for removal (presumably the smaller of the two -
    confirm against the helper). A span that is already marked takes no part
    in any later comparison.

    :param spans: list of span dicts (the 'bbox' key is read); the list is
        modified in place
    :return: tuple ``(spans, dropped_spans)`` - the same list object after
        removal, and the list of spans that were removed
    """
    dropped_spans = []

    for first in spans:
        for second in spans:
            # Compare only distinct spans, neither of which is already dropped.
            if first != second and first not in dropped_spans and second not in dropped_spans:
                overlap_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
                if overlap_box is None:
                    continue

                # Locate the first span that owns the returned box.
                victim = None
                for candidate in spans:
                    if candidate['bbox'] == overlap_box:
                        victim = candidate
                        break

                if victim is not None and victim not in dropped_spans:
                    dropped_spans.append(victim)

    for victim in dropped_spans:
        spans.remove(victim)

    return spans, dropped_spans
# Latin presentation-form ligatures (U+FB00-U+FB06) mapped to plain letters.
# The keys are written as escapes on purpose: if the ligature characters are
# typed literally, any Unicode normalisation of this file silently turns them
# into their expansions and the table degenerates into a no-op ('fi' -> 'fi').
_LIGATURE_TABLE = str.maketrans({
    '\ufb00': 'ff',
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    # NOTE(review): U+FB05 is LATIN SMALL LIGATURE LONG S T, whose Unicode
    # compatibility form is 'st'. The pre-existing mapping emits 'ft' and is
    # preserved here - confirm that this is intentional.
    '\ufb05': 'ft',
    '\ufb06': 'st',
})


def __replace_ligatures(text: str):
    """Expand typographic ligature characters into their component letters.

    Each of the code points U+FB00 through U+FB06 is replaced by its
    plain-letter spelling (for example U+FB01 becomes ``'fi'``); every other
    character is left untouched.

    :param text: the string to clean
    :return: a new string with the ligature characters expanded
    """
    return text.translate(_LIGATURE_TABLE)
def __replace_unicode(text: str):
    """Remove CR+LF pairs and turn the U+0002 control character into a hyphen.

    A lone ``'\\r'`` or ``'\\n'`` is left in place; only the two-character
    sequence is removed.

    :param text: the string to clean
    :return: a new string with the two substitutions applied
    """
    without_crlf = text.replace('\r\n', '')
    return without_crlf.replace('\u0002', '-')
def txt_spans_extract(pdf_page, spans, pil_img, scale):
    """Fill spans from the PDF text layer and prepare the remainder for OCR.

    Spans typed as interline equation, image or table are skipped. For every
    other span the bbox is shifted by the crop-box origin and flipped
    vertically using the page height, then the text inside that rectangle is
    read from the page's text layer:

    * if text is found, it is cleaned, stripped and stored in
      ``span['content']`` with ``span['score'] = 1.0``;
    * otherwise the span is cropped from ``pil_img`` and converted to a BGR
      array. Crops whose contrast is at most 0.17 cause the span to be
      removed from ``spans``; the others get an empty 'content', a score of
      1.0 and the crop stored under 'np_img' (presumably for a later OCR
      pass - confirm against the caller).

    :param pdf_page: page object providing ``get_textpage()``, ``get_size()``
        and ``get_cropbox()``
    :param spans: list of span dicts; modified in place
    :param pil_img: page image handed to ``get_crop_img``
    :param scale: scale factor handed to ``get_crop_img``
    :return: the same ``spans`` list object
    """
    textpage = pdf_page.get_textpage()
    _, height = pdf_page.get_size()  # the page width is not needed
    cropbox = pdf_page.get_cropbox()

    need_ocr_spans = []
    for span in spans:
        if span['type'] in [ContentType.INTERLINE_EQUATION, ContentType.IMAGE, ContentType.TABLE]:
            continue

        bbox = span['bbox']
        # Offset by the crop-box origin; the y axis is flipped via the page height.
        left = bbox[0] + cropbox[0]
        top = height - bbox[3] + cropbox[1]
        right = bbox[2] + cropbox[0]
        bottom = height - bbox[1] + cropbox[1]

        text = textpage.get_text_bounded(left=left, top=top, right=right, bottom=bottom)
        if not text:
            # Nothing in the text layer for this rectangle.
            need_ocr_spans.append(span)
            continue

        cleaned = __replace_ligatures(__replace_unicode(text))
        span['content'] = cleaned.strip()
        span['score'] = 1.0

    for span in need_ocr_spans:
        # Crop the span's bbox out of the page image for OCR.
        crop = get_crop_img(span['bbox'], pil_img, scale)
        span_img = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)

        # Spans whose contrast is 0.17 or lower are dropped instead of being OCR'd.
        if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
            spans.remove(span)
            continue

        span['content'] = ''
        span['score'] = 1.0
        span['np_img'] = span_img

    return spans
def calculate_contrast(img, img_mode) -> float:
    """
    Compute the contrast of an image.

    The image is converted to grayscale and the contrast is taken as the
    standard deviation of the gray values divided by their mean.

    :param img: the image, a numpy.ndarray
    :param img_mode: channel order of ``img``, either 'rgb' or 'bgr'
    :return: the contrast value, rounded to two decimal places
    :raises ValueError: if ``img_mode`` is neither 'rgb' nor 'bgr'
    """
    if img_mode not in ('rgb', 'bgr'):
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")

    # Choose the grayscale conversion that matches the channel order.
    to_gray = cv2.COLOR_RGB2GRAY if img_mode == 'rgb' else cv2.COLOR_BGR2GRAY
    gray_img = cv2.cvtColor(img, to_gray)

    spread = np.std(gray_img)
    brightness = np.mean(gray_img)

    # The small constant avoids a division by zero on an all-black image.
    return round(spread / (brightness + 1e-6), 2)
|