ocr_span_list_modify.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. from magic_pdf.config.drop_tag import DropTag
  2. from magic_pdf.config.ocr_content_type import BlockType
  3. from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
  4. def remove_overlaps_low_confidence_spans(spans):
  5. dropped_spans = []
  6. # 删除重叠spans中置信度低的的那些
  7. for span1 in spans:
  8. for span2 in spans:
  9. if span1 != span2:
  10. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  11. if span1 in dropped_spans or span2 in dropped_spans:
  12. continue
  13. else:
  14. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  15. if span1['score'] < span2['score']:
  16. span_need_remove = span1
  17. else:
  18. span_need_remove = span2
  19. if (
  20. span_need_remove is not None
  21. and span_need_remove not in dropped_spans
  22. ):
  23. dropped_spans.append(span_need_remove)
  24. if len(dropped_spans) > 0:
  25. for span_need_remove in dropped_spans:
  26. spans.remove(span_need_remove)
  27. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  28. return spans, dropped_spans
  29. def check_chars_is_overlap_in_span(chars):
  30. for i in range(len(chars)):
  31. for j in range(i + 1, len(chars)):
  32. if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.35:
  33. return True
  34. return False
  35. def remove_x_overlapping_chars(span, median_width):
  36. """
  37. Remove characters from a span that overlap significantly on the x-axis.
  38. Args:
  39. median_width:
  40. span (dict): A span containing a list of chars, each with bbox coordinates
  41. in the format [x0, y0, x1, y1]
  42. Returns:
  43. dict: The span with overlapping characters removed
  44. """
  45. if 'chars' not in span or len(span['chars']) < 2:
  46. return span
  47. overlap_threshold = median_width * 0.3
  48. i = 0
  49. while i < len(span['chars']) - 1:
  50. char1 = span['chars'][i]
  51. char2 = span['chars'][i + 1]
  52. # Calculate overlap width
  53. x_left = max(char1['bbox'][0], char2['bbox'][0])
  54. x_right = min(char1['bbox'][2], char2['bbox'][2])
  55. if x_right > x_left: # There is overlap
  56. overlap_width = x_right - x_left
  57. if overlap_width > overlap_threshold:
  58. if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
  59. # Determine which character to remove
  60. width1 = char1['bbox'][2] - char1['bbox'][0]
  61. width2 = char2['bbox'][2] - char2['bbox'][0]
  62. if width1 < width2:
  63. # Remove the narrower character
  64. span['chars'].pop(i)
  65. else:
  66. span['chars'].pop(i + 1)
  67. else:
  68. i += 1
  69. # Don't increment i since we need to check the new pair
  70. else:
  71. i += 1
  72. else:
  73. i += 1
  74. return span
  75. def remove_overlaps_min_spans(spans):
  76. dropped_spans = []
  77. # 删除重叠spans中较小的那些
  78. for span1 in spans:
  79. for span2 in spans:
  80. if span1 != span2:
  81. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  82. if span1 in dropped_spans or span2 in dropped_spans:
  83. continue
  84. else:
  85. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  86. if overlap_box is not None:
  87. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  88. if span_need_remove is not None and span_need_remove not in dropped_spans:
  89. dropped_spans.append(span_need_remove)
  90. if len(dropped_spans) > 0:
  91. for span_need_remove in dropped_spans:
  92. spans.remove(span_need_remove)
  93. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  94. return spans, dropped_spans
  95. def get_qa_need_list_v2(blocks):
  96. # 创建 images, tables, interline_equations, inline_equations 的副本
  97. images = []
  98. tables = []
  99. interline_equations = []
  100. for block in blocks:
  101. if block['type'] == BlockType.Image:
  102. images.append(block)
  103. elif block['type'] == BlockType.Table:
  104. tables.append(block)
  105. elif block['type'] == BlockType.InterlineEquation:
  106. interline_equations.append(block)
  107. return images, tables, interline_equations