# layout_match_processor.py
  1. import math
  2. from magic_pdf.para.commons import *
  3. if sys.version_info[0] >= 3:
  4. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  5. class LayoutFilterProcessor:
  6. def __init__(self) -> None:
  7. pass
  8. def batch_process_blocks(self, pdf_dict):
  9. for page_id, blocks in pdf_dict.items():
  10. if page_id.startswith("page_"):
  11. if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
  12. layout_bbox_objs = blocks["layout_bboxes"]
  13. if layout_bbox_objs is None:
  14. continue
  15. layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
  16. # Use math.ceil function to enlarge each value of x0, y0, x1, y1 of each layout_bbox
  17. layout_bboxes = [
  18. [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
  19. ]
  20. para_blocks = blocks["para_blocks"]
  21. if para_blocks is None:
  22. continue
  23. for lb_bbox in layout_bboxes:
  24. for i, para_block in enumerate(para_blocks):
  25. para_bbox = para_block["bbox"]
  26. para_blocks[i]["in_layout"] = 0
  27. if is_in_bbox(para_bbox, lb_bbox):
  28. para_blocks[i]["in_layout"] = 1
  29. blocks["para_blocks"] = para_blocks
  30. return pdf_dict