ocr_span_list_modify.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. from magic_pdf.config.drop_tag import DropTag
  2. from magic_pdf.config.ocr_content_type import BlockType
  3. from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
  4. def remove_overlaps_low_confidence_spans(spans):
  5. dropped_spans = []
  6. # 删除重叠spans中置信度低的的那些
  7. for span1 in spans:
  8. for span2 in spans:
  9. if span1 != span2:
  10. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  11. if span1 in dropped_spans or span2 in dropped_spans:
  12. continue
  13. else:
  14. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  15. if span1['score'] < span2['score']:
  16. span_need_remove = span1
  17. else:
  18. span_need_remove = span2
  19. if (
  20. span_need_remove is not None
  21. and span_need_remove not in dropped_spans
  22. ):
  23. dropped_spans.append(span_need_remove)
  24. if len(dropped_spans) > 0:
  25. for span_need_remove in dropped_spans:
  26. spans.remove(span_need_remove)
  27. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  28. return spans, dropped_spans
  29. def check_chars_is_overlap_in_span(chars):
  30. for i in range(len(chars)):
  31. for j in range(i + 1, len(chars)):
  32. if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.9:
  33. return True
  34. return False
  35. def remove_overlaps_min_spans(spans):
  36. dropped_spans = []
  37. # 删除重叠spans中较小的那些
  38. for span1 in spans:
  39. for span2 in spans:
  40. if span1 != span2:
  41. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  42. if span1 in dropped_spans or span2 in dropped_spans:
  43. continue
  44. else:
  45. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  46. if overlap_box is not None:
  47. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  48. if span_need_remove is not None and span_need_remove not in dropped_spans:
  49. dropped_spans.append(span_need_remove)
  50. if len(dropped_spans) > 0:
  51. for span_need_remove in dropped_spans:
  52. spans.remove(span_need_remove)
  53. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  54. return spans, dropped_spans
  55. def get_qa_need_list_v2(blocks):
  56. # 创建 images, tables, interline_equations, inline_equations 的副本
  57. images = []
  58. tables = []
  59. interline_equations = []
  60. for block in blocks:
  61. if block['type'] == BlockType.Image:
  62. images.append(block)
  63. elif block['type'] == BlockType.Table:
  64. tables.append(block)
  65. elif block['type'] == BlockType.InterlineEquation:
  66. interline_equations.append(block)
  67. return images, tables, interline_equations