| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207 |
class RawBlockProcessor:
    """
    Convert raw text blocks into "para blocks" with merged lines.

    For every page entry of a PDF dictionary, each raw block is rewritten as
    a new block whose lines are regrouped (raw lines whose top and bottom
    edges both agree within ``y_tolerance`` are merged) and whose spans carry
    a human readable ``decomposed_flags`` dictionary.
    """

    def __init__(self, y_tolerance=2) -> None:
        """
        Parameters
        ----------
        y_tolerance : int or float, optional
            Maximum absolute difference allowed between the top edges, and
            between the bottom edges, of two consecutive raw lines for them
            to be merged into one line. Defaults to 2.
        """
        self.y_tolerance = y_tolerance
        # Not read by any method of this class; kept so that existing code
        # accessing the attribute keeps working.
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags: int) -> dict:
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            Bit field describing the font of a span.
            NOTE(review): the bit layout looks like PyMuPDF span flags;
            confirm against the producer of the spans.

        Returns
        -------
        dict
            One boolean per property. ``is_sans_serifed`` is always the
            negation of ``is_serifed`` and ``is_proportional`` is always the
            negation of ``is_monospaced``.
        """
        is_serifed = bool(span_flags & (1 << 2))
        is_monospaced = bool(span_flags & (1 << 3))
        return {
            "is_superscript": bool(span_flags & (1 << 0)),  # bit 0: superscript
            "is_italic": bool(span_flags & (1 << 1)),  # bit 1: italic
            "is_serifed": is_serifed,  # bit 2 set: serif font
            "is_sans_serifed": not is_serifed,  # bit 2 clear: sans-serif font
            "is_monospaced": is_monospaced,  # bit 3 set: monospaced font
            "is_proportional": not is_monospaced,  # bit 3 clear: proportional
            "is_bold": bool(span_flags & (1 << 4)),  # bit 4: bold
        }

    def __make_new_lines(self, raw_lines: list) -> list:
        """
        Merge consecutive raw lines that sit on the same visual row.

        A raw line is merged into the line currently being built when both
        its top (``bbox[1]``) and bottom (``bbox[3]``) differ from that
        line's by at most ``self.y_tolerance``; otherwise it starts a new
        line.

        Parameters
        ----------
        raw_lines : list
            Raw line dictionaries. Each must provide ``"bbox"`` (indexable,
            at least 4 numbers) and ``"spans"`` (a list of dictionaries with
            ``"text"`` and ``"flags"``); ``"dir"`` is optional.

        Returns
        -------
        new_lines : list
            Dictionaries with the keys ``"bbox"``, ``"text"``, ``"dir"`` and
            ``"spans"``. The span dictionaries are the input ones, annotated
            in place with a ``"decomposed_flags"`` entry.
        """
        new_lines = []
        current = None
        for raw_line in raw_lines:
            bbox = raw_line["bbox"]
            spans = raw_line["spans"]
            text = "".join(span["text"] for span in spans)
            # A missing or empty direction counts as (0, 0) everywhere,
            # including when merging, where None used to raise a TypeError.
            line_dir = raw_line.get("dir") or (0, 0)

            # The span dictionaries are annotated in place (the input spans
            # gain the "decomposed_flags" key as well).
            for span in spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])

            same_row = (
                current is not None
                and abs(bbox[1] - current["bbox"][1]) <= self.y_tolerance
                and abs(bbox[3] - current["bbox"][3]) <= self.y_tolerance
            )
            if same_row:
                merged = current["bbox"]
                # Union of both boxes, so the merged line really encloses
                # every raw line it was built from.
                current["bbox"] = (
                    min(merged[0], bbox[0]),  # left
                    min(merged[1], bbox[1]),  # top
                    max(merged[2], bbox[2]),  # right
                    max(merged[3], bbox[3]),  # bottom
                )
                current["text"] += " " + text
                current["spans"].extend(spans)
                # Directions of merged lines are accumulated component-wise.
                current["dir"] = (
                    current["dir"][0] + line_dir[0],
                    current["dir"][1] + line_dir[1],
                )
                continue

            if current is not None:
                new_lines.append(current)
            current = {
                "bbox": bbox,
                "text": text,
                "dir": line_dir,
                # Own list: a later merge must not grow the raw line's list.
                "spans": list(spans),
            }

        if current is not None:
            new_lines.append(current)
        return new_lines

    def __make_new_block(self, raw_block: dict) -> dict:
        """
        Build a new block from a raw block.

        Parameters
        ----------
        raw_block : dict
            A raw block providing the keys ``"number"``, ``"bbox"`` and
            ``"lines"``.

        Returns
        -------
        new_block : dict
            Schema of new_block:
            {
                "block_id": <value of raw_block["number"]>,
                "bbox": <value of raw_block["bbox"]>,
                "text": <texts of all spans of the block joined by " ">,
                "lines": [
                    {
                        "bbox": (left, top, right, bottom),
                        "text": "This is a line.",
                        "dir": (x, y),
                        "spans": [
                            {
                                ...original span entries...,
                                "decomposed_flags": {...},
                            }
                        ],
                    }
                ],
            }
        """
        raw_lines = raw_block["lines"]
        block_text = " ".join(
            span["text"] for line in raw_lines for span in line["spans"]
        )
        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            "text": block_text,
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic: dict) -> dict:
        """
        Process the raw blocks of every page in batch.

        Parameters
        ----------
        pdf_dic : dict
            Maps keys to per-entry dictionaries. Only entries whose key
            starts with ``"page_"`` are processed; their raw blocks are read
            from the optional key ``"preproc_blocks"`` (demo file:
            app/pdf_toolbox/tests/preproc_2_parasplit_example.json).

        Returns
        -------
        pdf_dic : dict
            The same dictionary object, modified in place: every processed
            page gains a ``"para_blocks"`` list, which is empty when the
            page has no ``"preproc_blocks"`` key.
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            blocks["para_blocks"] = [
                self.__make_new_block(raw_block)
                for raw_block in blocks.get("preproc_blocks", [])
            ]
        return pdf_dic
|