| 12345678910111213141516171819202122232425262728293031323334353637383940 |
- import math
- from pdf_tools.para.commons import *
# Switch stdout's text encoding to UTF-8 when the module is imported.
# NOTE(review): `sys` is not imported explicitly in the visible lines; it
# presumably arrives through the star import above -- confirm.
if sys.version_info[0] >= 3:
    # stdout may have been replaced by an object without ``reconfigure``
    # (e.g. io.StringIO, or None); skip the call in that case instead of
    # failing at import time.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class LayoutFilterProcessor:
    """Flag the paragraph blocks of each page by layout-box membership."""

    def __init__(self) -> None:
        """Create a processor; no state is stored on the instance."""
        pass

    def batch_process_blocks(self, pdf_dict):
        """Write an ``in_layout`` flag onto every paragraph block.

        Only entries whose key starts with ``"page_"`` and whose value holds
        both ``"layout_bboxes"`` and ``"para_blocks"`` are processed; entries
        where either of those values is ``None`` are skipped as well.

        For each processed page, every paragraph block gets
        ``in_layout = 1`` when ``is_in_bbox(para_bbox, layout_bbox)`` is true
        for at least one layout box of that page, and ``in_layout = 0``
        otherwise (including when the page has no layout boxes at all).

        Args:
            pdf_dict: Mapping of page ids to per-page dicts. Each layout
                object is read through its ``"layout_bbox"`` key and each
                paragraph block through its ``"bbox"`` key.

        Returns:
            The same ``pdf_dict`` object; paragraph blocks are modified in
            place.
        """
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
                continue

            layout_bbox_objs = blocks["layout_bboxes"]
            if layout_bbox_objs is None:
                continue

            # Round every coordinate up to the next integer.
            # NOTE(review): ceil moves x0/y0 inwards, so this does not grow
            # the box on its top-left side -- confirm that is intended.
            layout_bboxes = [
                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
                for x0, y0, x1, y1 in (obj["layout_bbox"] for obj in layout_bbox_objs)
            ]

            para_blocks = blocks["para_blocks"]
            if para_blocks is None:
                continue

            for para_block in para_blocks:
                para_bbox = para_block["bbox"]
                # Decide once per paragraph across ALL layout boxes. Resetting
                # the flag for every layout box would let only the last box
                # determine the result.
                # is_in_bbox comes from the project's commons module;
                # presumably it tests whether the first box lies inside the
                # second -- verify against its definition.
                inside_any = any(
                    is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes
                )
                para_block["in_layout"] = 1 if inside_any else 0

        return pdf_dict
|