# draw_bbox.py (12 KB)
  1. import json
  2. from io import BytesIO
  3. from PyPDF2 import PdfReader, PdfWriter
  4. from reportlab.pdfgen import canvas
  5. from .enum_class import BlockType, ContentType
  6. def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
  7. new_rgb = [float(color) / 255 for color in rgb_config]
  8. page_data = bbox_list[i]
  9. page_width, page_height = page.cropbox[2], page.cropbox[3]
  10. for bbox in page_data:
  11. width = bbox[2] - bbox[0]
  12. height = bbox[3] - bbox[1]
  13. rect = [bbox[0], page_height - bbox[3], width, height] # Define the rectangle
  14. if fill_config: # filled rectangle
  15. c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)
  16. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  17. else: # bounding box
  18. c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])
  19. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  20. return c
  21. def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
  22. new_rgb = [float(color) / 255 for color in rgb_config]
  23. page_data = bbox_list[i]
  24. # 强制转换为 float
  25. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  26. for j, bbox in enumerate(page_data):
  27. # 确保bbox的每个元素都是float
  28. x0, y0, x1, y1 = map(float, bbox)
  29. width = x1 - x0
  30. height = y1 - y0
  31. rect = [x0, page_height - y1, width, height]
  32. if draw_bbox:
  33. if fill_config:
  34. c.setFillColorRGB(*new_rgb, 0.3)
  35. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  36. else:
  37. c.setStrokeColorRGB(*new_rgb)
  38. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  39. c.setFillColorRGB(*new_rgb, 1.0)
  40. c.setFontSize(size=10)
  41. # 这里也要用float
  42. c.drawString(x1 + 2, page_height - y0 - 10, str(j + 1))
  43. return c
  44. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  45. dropped_bbox_list = []
  46. tables_list, tables_body_list = [], []
  47. tables_caption_list, tables_footnote_list = [], []
  48. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  49. imgs_footnote_list = []
  50. titles_list = []
  51. texts_list = []
  52. interequations_list = []
  53. lists_list = []
  54. indexs_list = []
  55. for page in pdf_info:
  56. page_dropped_list = []
  57. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  58. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  59. titles = []
  60. texts = []
  61. interequations = []
  62. lists = []
  63. indices = []
  64. for dropped_bbox in page['discarded_blocks']:
  65. page_dropped_list.append(dropped_bbox['bbox'])
  66. dropped_bbox_list.append(page_dropped_list)
  67. for block in page["para_blocks"]:
  68. bbox = block["bbox"]
  69. if block["type"] == BlockType.TABLE:
  70. tables.append(bbox)
  71. for nested_block in block["blocks"]:
  72. bbox = nested_block["bbox"]
  73. if nested_block["type"] == BlockType.TABLE_BODY:
  74. tables_body.append(bbox)
  75. elif nested_block["type"] == BlockType.TABLE_CAPTION:
  76. tables_caption.append(bbox)
  77. elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:
  78. tables_footnote.append(bbox)
  79. elif block["type"] == BlockType.IMAGE:
  80. imgs.append(bbox)
  81. for nested_block in block["blocks"]:
  82. bbox = nested_block["bbox"]
  83. if nested_block["type"] == BlockType.IMAGE_BODY:
  84. imgs_body.append(bbox)
  85. elif nested_block["type"] == BlockType.IMAGE_CAPTION:
  86. imgs_caption.append(bbox)
  87. elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
  88. imgs_footnote.append(bbox)
  89. elif block["type"] == BlockType.TITLE:
  90. titles.append(bbox)
  91. elif block["type"] == BlockType.TEXT:
  92. texts.append(bbox)
  93. elif block["type"] == BlockType.INTERLINE_EQUATION:
  94. interequations.append(bbox)
  95. elif block["type"] == BlockType.LIST:
  96. lists.append(bbox)
  97. elif block["type"] == BlockType.INDEX:
  98. indices.append(bbox)
  99. tables_list.append(tables)
  100. tables_body_list.append(tables_body)
  101. tables_caption_list.append(tables_caption)
  102. tables_footnote_list.append(tables_footnote)
  103. imgs_list.append(imgs)
  104. imgs_body_list.append(imgs_body)
  105. imgs_caption_list.append(imgs_caption)
  106. imgs_footnote_list.append(imgs_footnote)
  107. titles_list.append(titles)
  108. texts_list.append(texts)
  109. interequations_list.append(interequations)
  110. lists_list.append(lists)
  111. indexs_list.append(indices)
  112. layout_bbox_list = []
  113. table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
  114. for page in pdf_info:
  115. page_block_list = []
  116. for block in page["para_blocks"]:
  117. if block["type"] in [
  118. BlockType.TEXT,
  119. BlockType.TITLE,
  120. BlockType.INTERLINE_EQUATION,
  121. BlockType.LIST,
  122. BlockType.INDEX,
  123. ]:
  124. bbox = block["bbox"]
  125. page_block_list.append(bbox)
  126. elif block["type"] in [BlockType.IMAGE]:
  127. for sub_block in block["blocks"]:
  128. bbox = sub_block["bbox"]
  129. page_block_list.append(bbox)
  130. elif block["type"] in [BlockType.TABLE]:
  131. sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
  132. for sub_block in sorted_blocks:
  133. bbox = sub_block["bbox"]
  134. page_block_list.append(bbox)
  135. layout_bbox_list.append(page_block_list)
  136. pdf_bytes_io = BytesIO(pdf_bytes)
  137. pdf_docs = PdfReader(pdf_bytes_io)
  138. output_pdf = PdfWriter()
  139. for i, page in enumerate(pdf_docs.pages):
  140. # 获取原始页面尺寸
  141. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  142. custom_page_size = (page_width, page_height)
  143. packet = BytesIO()
  144. # 使用原始PDF的尺寸创建canvas
  145. c = canvas.Canvas(packet, pagesize=custom_page_size)
  146. c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
  147. c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
  148. c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
  149. c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
  150. c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
  151. c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
  152. c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
  153. c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
  154. c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
  155. c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
  156. c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
  157. c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
  158. c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
  159. c.save()
  160. packet.seek(0)
  161. overlay_pdf = PdfReader(packet)
  162. page.merge_page(overlay_pdf.pages[0])
  163. output_pdf.add_page(page)
  164. # 保存结果
  165. with open(f"{out_path}/{filename}", "wb") as f:
  166. output_pdf.write(f)
  167. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  168. text_list = []
  169. inline_equation_list = []
  170. interline_equation_list = []
  171. image_list = []
  172. table_list = []
  173. dropped_list = []
  174. next_page_text_list = []
  175. next_page_inline_equation_list = []
  176. def get_span_info(span):
  177. if span['type'] == ContentType.TEXT:
  178. if span.get('cross_page', False):
  179. next_page_text_list.append(span['bbox'])
  180. else:
  181. page_text_list.append(span['bbox'])
  182. elif span['type'] == ContentType.INLINE_EQUATION:
  183. if span.get('cross_page', False):
  184. next_page_inline_equation_list.append(span['bbox'])
  185. else:
  186. page_inline_equation_list.append(span['bbox'])
  187. elif span['type'] == ContentType.INTERLINE_EQUATION:
  188. page_interline_equation_list.append(span['bbox'])
  189. elif span['type'] == ContentType.IMAGE:
  190. page_image_list.append(span['bbox'])
  191. elif span['type'] == ContentType.TABLE:
  192. page_table_list.append(span['bbox'])
  193. for page in pdf_info:
  194. page_text_list = []
  195. page_inline_equation_list = []
  196. page_interline_equation_list = []
  197. page_image_list = []
  198. page_table_list = []
  199. page_dropped_list = []
  200. # 将跨页的span放到移动到下一页的列表中
  201. if len(next_page_text_list) > 0:
  202. page_text_list.extend(next_page_text_list)
  203. next_page_text_list.clear()
  204. if len(next_page_inline_equation_list) > 0:
  205. page_inline_equation_list.extend(next_page_inline_equation_list)
  206. next_page_inline_equation_list.clear()
  207. # 构造dropped_list
  208. for block in page['discarded_blocks']:
  209. if block['type'] == BlockType.DISCARDED:
  210. for line in block['lines']:
  211. for span in line['spans']:
  212. page_dropped_list.append(span['bbox'])
  213. dropped_list.append(page_dropped_list)
  214. # 构造其余useful_list
  215. # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
  216. for block in page['preproc_blocks']:
  217. if block['type'] in [
  218. BlockType.TEXT,
  219. BlockType.TITLE,
  220. BlockType.INTERLINE_EQUATION,
  221. BlockType.LIST,
  222. BlockType.INDEX,
  223. ]:
  224. for line in block['lines']:
  225. for span in line['spans']:
  226. get_span_info(span)
  227. elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  228. for sub_block in block['blocks']:
  229. for line in sub_block['lines']:
  230. for span in line['spans']:
  231. get_span_info(span)
  232. text_list.append(page_text_list)
  233. inline_equation_list.append(page_inline_equation_list)
  234. interline_equation_list.append(page_interline_equation_list)
  235. image_list.append(page_image_list)
  236. table_list.append(page_table_list)
  237. pdf_bytes_io = BytesIO(pdf_bytes)
  238. pdf_docs = PdfReader(pdf_bytes_io)
  239. output_pdf = PdfWriter()
  240. for i, page in enumerate(pdf_docs.pages):
  241. # 获取原始页面尺寸
  242. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  243. custom_page_size = (page_width, page_height)
  244. packet = BytesIO()
  245. # 使用原始PDF的尺寸创建canvas
  246. c = canvas.Canvas(packet, pagesize=custom_page_size)
  247. # 获取当前页面的数据
  248. draw_bbox_without_number(i, text_list, page, c,[255, 0, 0], False)
  249. draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
  250. draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
  251. draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
  252. draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
  253. draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
  254. c.save()
  255. packet.seek(0)
  256. overlay_pdf = PdfReader(packet)
  257. page.merge_page(overlay_pdf.pages[0])
  258. output_pdf.add_page(page)
  259. # Save the PDF
  260. with open(f"{out_path}/{filename}", "wb") as f:
  261. output_pdf.write(f)
  262. if __name__ == "__main__":
  263. # 读取PDF文件
  264. pdf_path = "examples/demo1.pdf"
  265. with open(pdf_path, "rb") as f:
  266. pdf_bytes = f.read()
  267. # 从json文件读取pdf_info
  268. json_path = "examples/demo1_1746005777.0863056_middle.json"
  269. with open(json_path, "r", encoding="utf-8") as f:
  270. pdf_ann = json.load(f)
  271. pdf_info = pdf_ann["pdf_info"]
  272. # 调用可视化函数,输出到examples目录
  273. draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")