# ocr_span_list_modify.py
  1. from magic_pdf.config.drop_tag import DropTag
  2. from magic_pdf.config.ocr_content_type import BlockType
  3. from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
  4. def remove_overlaps_low_confidence_spans(spans):
  5. dropped_spans = []
  6. # 删除重叠spans中置信度低的的那些
  7. for span1 in spans:
  8. for span2 in spans:
  9. if span1 != span2:
  10. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  11. if span1 in dropped_spans or span2 in dropped_spans:
  12. continue
  13. else:
  14. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  15. if span1['score'] < span2['score']:
  16. span_need_remove = span1
  17. else:
  18. span_need_remove = span2
  19. if (
  20. span_need_remove is not None
  21. and span_need_remove not in dropped_spans
  22. ):
  23. dropped_spans.append(span_need_remove)
  24. if len(dropped_spans) > 0:
  25. for span_need_remove in dropped_spans:
  26. spans.remove(span_need_remove)
  27. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  28. return spans, dropped_spans
  29. def remove_overlaps_min_spans(spans):
  30. dropped_spans = []
  31. # 删除重叠spans中较小的那些
  32. for span1 in spans:
  33. for span2 in spans:
  34. if span1 != span2:
  35. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  36. if span1 in dropped_spans or span2 in dropped_spans:
  37. continue
  38. else:
  39. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  40. if overlap_box is not None:
  41. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  42. if span_need_remove is not None and span_need_remove not in dropped_spans:
  43. dropped_spans.append(span_need_remove)
  44. if len(dropped_spans) > 0:
  45. for span_need_remove in dropped_spans:
  46. spans.remove(span_need_remove)
  47. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  48. return spans, dropped_spans
  49. def get_qa_need_list_v2(blocks):
  50. # 创建 images, tables, interline_equations, inline_equations 的副本
  51. images = []
  52. tables = []
  53. interline_equations = []
  54. for block in blocks:
  55. if block['type'] == BlockType.Image:
  56. images.append(block)
  57. elif block['type'] == BlockType.Table:
  58. tables.append(block)
  59. elif block['type'] == BlockType.InterlineEquation:
  60. interline_equations.append(block)
  61. return images, tables, interline_equations