layout_spiler_recog.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
"""
Find the horizontal rules and color blocks that can split a page layout.
"""
  4. import os
  5. from magic_pdf.libs.commons import fitz
  6. from magic_pdf.libs.boxbase import _is_in_or_part_overlap
  7. def __rect_filter_by_width(rect, page_w, page_h):
  8. mid_x = page_w/2
  9. if rect[0]< mid_x < rect[2]:
  10. return True
  11. return False
  12. def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
  13. """
  14. 不能出现在table和image的位置
  15. """
  16. for box in image_bboxes:
  17. if _is_in_or_part_overlap(rect, box):
  18. return False
  19. for box in table_bboxes:
  20. if _is_in_or_part_overlap(rect, box):
  21. return False
  22. return True
  23. def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
  24. save_path = "./tmp/debug.pdf"
  25. if os.path.exists(save_path):
  26. # 删除已经存在的文件
  27. os.remove(save_path)
  28. # 创建一个新的空白 PDF 文件
  29. doc = fitz.open('')
  30. width = page.rect.width
  31. height = page.rect.height
  32. new_page = doc.new_page(width=width, height=height)
  33. shape = new_page.new_shape()
  34. for bbox in bboxes1:
  35. # 原始box画上去
  36. rect = fitz.Rect(*bbox[0:4])
  37. shape = new_page.new_shape()
  38. shape.draw_rect(rect)
  39. shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
  40. shape.finish()
  41. shape.commit()
  42. for bbox in bboxes2:
  43. # 原始box画上去
  44. rect = fitz.Rect(*bbox[0:4])
  45. shape = new_page.new_shape()
  46. shape.draw_rect(rect)
  47. shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
  48. shape.finish()
  49. shape.commit()
  50. for bbox in bboxes3:
  51. # 原始box画上去
  52. rect = fitz.Rect(*bbox[0:4])
  53. shape = new_page.new_shape()
  54. shape.draw_rect(rect)
  55. shape.finish(color=fitz.pdfcolor['red'], fill=None)
  56. shape.finish()
  57. shape.commit()
  58. parent_dir = os.path.dirname(save_path)
  59. if not os.path.exists(parent_dir):
  60. os.makedirs(parent_dir)
  61. doc.save(save_path)
  62. doc.close()
  63. def get_spilter_of_page(page, image_bboxes, table_bboxes):
  64. """
  65. 获取到色块和横线
  66. """
  67. cdrawings = page.get_cdrawings()
  68. spilter_bbox = []
  69. for block in cdrawings:
  70. if 'fill' in block:
  71. fill = block['fill']
  72. if 'fill' in block and block['fill'] and block['fill']!=(1.0,1.0,1.0):
  73. rect = block['rect']
  74. if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
  75. spilter_bbox.append(list(rect))
  76. """过滤、修正一下这些box。因为有时候会有一些矩形,高度为0或者为负数,造成layout计算无限循环。如果是负高度或者0高度,统一修正为高度为1"""
  77. for box in spilter_bbox:
  78. if box[3]-box[1] <= 0:
  79. box[3] = box[1] + 1
  80. #__debug_show_page(page, spilter_bbox, [], [])
  81. return spilter_bbox