span_pre_proc.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import cv2
  3. import numpy as np
  4. from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio, calculate_iou, \
  5. get_minbox_if_overlap_by_ratio
  6. from mineru.utils.enum_class import BlockType, ContentType
  7. from mineru.utils.pdf_image_tools import get_crop_img
  8. def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
  9. def get_block_bboxes(blocks, block_type_list):
  10. return [block[0:4] for block in blocks if block[7] in block_type_list]
  11. image_bboxes = get_block_bboxes(all_bboxes, [BlockType.IMAGE_BODY])
  12. table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TABLE_BODY])
  13. other_block_type = []
  14. for block_type in BlockType.__dict__.values():
  15. if not isinstance(block_type, str):
  16. continue
  17. if block_type not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
  18. other_block_type.append(block_type)
  19. other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
  20. discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.DISCARDED])
  21. new_spans = []
  22. for span in spans:
  23. span_bbox = span['bbox']
  24. span_type = span['type']
  25. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
  26. discarded_block_bboxes):
  27. new_spans.append(span)
  28. continue
  29. if span_type == ContentType.IMAGE:
  30. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  31. image_bboxes):
  32. new_spans.append(span)
  33. elif span_type == ContentType.TABLE:
  34. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  35. table_bboxes):
  36. new_spans.append(span)
  37. else:
  38. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  39. other_block_bboxes):
  40. new_spans.append(span)
  41. return new_spans
  42. def remove_overlaps_low_confidence_spans(spans):
  43. dropped_spans = []
  44. # 删除重叠spans中置信度低的的那些
  45. for span1 in spans:
  46. for span2 in spans:
  47. if span1 != span2:
  48. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  49. if span1 in dropped_spans or span2 in dropped_spans:
  50. continue
  51. else:
  52. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  53. if span1['score'] < span2['score']:
  54. span_need_remove = span1
  55. else:
  56. span_need_remove = span2
  57. if (
  58. span_need_remove is not None
  59. and span_need_remove not in dropped_spans
  60. ):
  61. dropped_spans.append(span_need_remove)
  62. if len(dropped_spans) > 0:
  63. for span_need_remove in dropped_spans:
  64. spans.remove(span_need_remove)
  65. return spans, dropped_spans
  66. def remove_overlaps_min_spans(spans):
  67. dropped_spans = []
  68. # 删除重叠spans中较小的那些
  69. for span1 in spans:
  70. for span2 in spans:
  71. if span1 != span2:
  72. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  73. if span1 in dropped_spans or span2 in dropped_spans:
  74. continue
  75. else:
  76. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  77. if overlap_box is not None:
  78. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  79. if span_need_remove is not None and span_need_remove not in dropped_spans:
  80. dropped_spans.append(span_need_remove)
  81. if len(dropped_spans) > 0:
  82. for span_need_remove in dropped_spans:
  83. spans.remove(span_need_remove)
  84. return spans, dropped_spans
  85. def txt_spans_extract(pdf_page, spans, pil_img, scale):
  86. textpage = pdf_page.get_textpage()
  87. width, height = pdf_page.get_size()
  88. cropbox = pdf_page.get_cropbox()
  89. need_ocr_spans = []
  90. for span in spans:
  91. span_bbox = span['bbox']
  92. rect_box = [span_bbox[0] + cropbox[0],
  93. height - span_bbox[3] + cropbox[1],
  94. span_bbox[2] + cropbox[0],
  95. height - span_bbox[1] + cropbox[1]]
  96. text = textpage.get_text_bounded(left=rect_box[0], top=rect_box[1],
  97. right=rect_box[2], bottom=rect_box[3])
  98. if text and len(text) > 0:
  99. span['content'] = text.strip()
  100. span['score'] = 1.0
  101. else:
  102. need_ocr_spans.append(span)
  103. if len(need_ocr_spans) > 0:
  104. for span in need_ocr_spans:
  105. # 对span的bbox截图再ocr
  106. span_pil_img = get_crop_img(span['bbox'], pil_img, scale)
  107. span_img = cv2.cvtColor(np.array(span_pil_img), cv2.COLOR_RGB2BGR)
  108. # 计算span的对比度,低于0.20的span不进行ocr
  109. if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
  110. spans.remove(span)
  111. continue
  112. span['content'] = ''
  113. span['score'] = 1.0
  114. span['np_img'] = span_img
  115. return spans
  116. def calculate_contrast(img, img_mode) -> float:
  117. """
  118. 计算给定图像的对比度。
  119. :param img: 图像,类型为numpy.ndarray
  120. :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr'
  121. :return: 图像的对比度值
  122. """
  123. if img_mode == 'rgb':
  124. # 将RGB图像转换为灰度图
  125. gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
  126. elif img_mode == 'bgr':
  127. # 将BGR图像转换为灰度图
  128. gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  129. else:
  130. raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
  131. # 计算均值和标准差
  132. mean_value = np.mean(gray_img)
  133. std_dev = np.std(gray_img)
  134. # 对比度定义为标准差除以平均值(加上小常数避免除零错误)
  135. contrast = std_dev / (mean_value + 1e-6)
  136. # logger.debug(f"contrast: {contrast}")
  137. return round(contrast, 2)