raw_processor.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. class RawBlockProcessor:
  2. def __init__(self) -> None:
  3. self.y_tolerance = 2
  4. self.pdf_dic = {}
  5. def __span_flags_decomposer(self, span_flags):
  6. """
  7. Make font flags human readable.
  8. Parameters
  9. ----------
  10. self : object
  11. The instance of the class.
  12. span_flags : int
  13. span flags
  14. Returns
  15. -------
  16. l : dict
  17. decomposed flags
  18. """
  19. l = {
  20. "is_superscript": False,
  21. "is_italic": False,
  22. "is_serifed": False,
  23. "is_sans_serifed": False,
  24. "is_monospaced": False,
  25. "is_proportional": False,
  26. "is_bold": False,
  27. }
  28. if span_flags & 2**0:
  29. l["is_superscript"] = True # 表示上标
  30. if span_flags & 2**1:
  31. l["is_italic"] = True # 表示斜体
  32. if span_flags & 2**2:
  33. l["is_serifed"] = True # 表示衬线字体
  34. else:
  35. l["is_sans_serifed"] = True # 表示非衬线字体
  36. if span_flags & 2**3:
  37. l["is_monospaced"] = True # 表示等宽字体
  38. else:
  39. l["is_proportional"] = True # 表示比例字体
  40. if span_flags & 2**4:
  41. l["is_bold"] = True # 表示粗体
  42. return l
  43. def __make_new_lines(self, raw_lines):
  44. """
  45. This function makes new lines.
  46. Parameters
  47. ----------
  48. self : object
  49. The instance of the class.
  50. raw_lines : list
  51. raw lines
  52. Returns
  53. -------
  54. new_lines : list
  55. new lines
  56. """
  57. new_lines = []
  58. new_line = None
  59. for raw_line in raw_lines:
  60. raw_line_bbox = raw_line["bbox"]
  61. raw_line_spans = raw_line["spans"]
  62. raw_line_text = "".join([span["text"] for span in raw_line_spans])
  63. raw_line_dir = raw_line.get("dir", None)
  64. decomposed_line_spans = []
  65. for span in raw_line_spans:
  66. raw_flags = span["flags"]
  67. decomposed_flags = self.__span_flags_decomposer(raw_flags)
  68. span["decomposed_flags"] = decomposed_flags
  69. decomposed_line_spans.append(span)
  70. if new_line is None:
  71. new_line = {
  72. "bbox": raw_line_bbox,
  73. "text": raw_line_text,
  74. "dir": raw_line_dir if raw_line_dir else (0, 0),
  75. "spans": decomposed_line_spans,
  76. }
  77. else:
  78. if (
  79. abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
  80. and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
  81. ):
  82. new_line["bbox"] = (
  83. min(new_line["bbox"][0], raw_line_bbox[0]), # left
  84. new_line["bbox"][1], # top
  85. max(new_line["bbox"][2], raw_line_bbox[2]), # right
  86. raw_line_bbox[3], # bottom
  87. )
  88. new_line["text"] += " " + raw_line_text
  89. new_line["spans"].extend(raw_line_spans)
  90. new_line["dir"] = (
  91. new_line["dir"][0] + raw_line_dir[0],
  92. new_line["dir"][1] + raw_line_dir[1],
  93. )
  94. else:
  95. new_lines.append(new_line)
  96. new_line = {
  97. "bbox": raw_line_bbox,
  98. "text": raw_line_text,
  99. "dir": raw_line_dir if raw_line_dir else (0, 0),
  100. "spans": raw_line_spans,
  101. }
  102. if new_line:
  103. new_lines.append(new_line)
  104. return new_lines
  105. def __make_new_block(self, raw_block):
  106. """
  107. This function makes a new block.
  108. Parameters
  109. ----------
  110. self : object
  111. The instance of the class.
  112. ----------
  113. raw_block : dict
  114. a raw block
  115. Returns
  116. -------
  117. new_block : dict
  118. Schema of new_block:
  119. {
  120. "block_id": "block_1",
  121. "bbox": [0, 0, 100, 100],
  122. "text": "This is a block.",
  123. "lines": [
  124. {
  125. "bbox": [0, 0, 100, 100],
  126. "text": "This is a line.",
  127. "spans": [
  128. {
  129. "text": "This is a span.",
  130. "font": "Times New Roman",
  131. "size": 12,
  132. "color": "#000000",
  133. }
  134. ],
  135. }
  136. ],
  137. }
  138. """
  139. new_block = {}
  140. block_id = raw_block["number"]
  141. block_bbox = raw_block["bbox"]
  142. block_text = " ".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
  143. raw_lines = raw_block["lines"]
  144. block_lines = self.__make_new_lines(raw_lines)
  145. new_block["block_id"] = block_id
  146. new_block["bbox"] = block_bbox
  147. new_block["text"] = block_text
  148. new_block["lines"] = block_lines
  149. return new_block
  150. def batch_process_blocks(self, pdf_dic):
  151. """
  152. This function processes the blocks in batch.
  153. Parameters
  154. ----------
  155. self : object
  156. The instance of the class.
  157. ----------
  158. blocks : list
  159. Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json.
  160. Returns
  161. -------
  162. result_dict : dict
  163. result dictionary
  164. """
  165. for page_id, blocks in pdf_dic.items():
  166. if page_id.startswith("page_"):
  167. para_blocks = []
  168. if "preproc_blocks" in blocks.keys():
  169. input_blocks = blocks["preproc_blocks"]
  170. for raw_block in input_blocks:
  171. new_block = self.__make_new_block(raw_block)
  172. para_blocks.append(new_block)
  173. blocks["para_blocks"] = para_blocks
  174. return pdf_dic