draw_bbox.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. import json
  2. from io import BytesIO
  3. from loguru import logger
  4. from pypdf import PdfReader, PdfWriter
  5. from reportlab.pdfgen import canvas
  6. from .enum_class import BlockType, ContentType
  7. def cal_canvas_rect(page, bbox):
  8. """
  9. Calculate the rectangle coordinates on the canvas based on the original PDF page and bounding box.
  10. Args:
  11. page: A PyPDF2 Page object representing a single page in the PDF.
  12. bbox: [x0, y0, x1, y1] representing the bounding box coordinates.
  13. Returns:
  14. rect: [x0, y0, width, height] representing the rectangle coordinates on the canvas.
  15. """
  16. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  17. actual_width = page_width # The width of the final PDF display
  18. actual_height = page_height # The height of the final PDF display
  19. rotation = page.get("/Rotate", 0)
  20. rotation = rotation % 360
  21. if rotation in [90, 270]:
  22. # PDF is rotated 90 degrees or 270 degrees, and the width and height need to be swapped
  23. actual_width, actual_height = actual_height, actual_width
  24. x0, y0, x1, y1 = bbox
  25. rect_w = abs(x1 - x0)
  26. rect_h = abs(y1 - y0)
  27. if 270 == rotation:
  28. rect_w, rect_h = rect_h, rect_w
  29. x0 = actual_height - y1
  30. y0 = actual_width - x1
  31. elif 180 == rotation:
  32. x0 = page_width - x1
  33. y0 = y0
  34. elif 90 == rotation:
  35. rect_w, rect_h = rect_h, rect_w
  36. x0, y0 = y0, x0
  37. else:
  38. # 0 == rotation:
  39. x0 = x0
  40. y0 = page_height - y1
  41. rect = [x0, y0, rect_w, rect_h]
  42. return rect
  43. def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
  44. new_rgb = [float(color) / 255 for color in rgb_config]
  45. page_data = bbox_list[i]
  46. for bbox in page_data:
  47. rect = cal_canvas_rect(page, bbox) # Define the rectangle
  48. if fill_config: # filled rectangle
  49. c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)
  50. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  51. else: # bounding box
  52. c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])
  53. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  54. return c
  55. def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
  56. new_rgb = [float(color) / 255 for color in rgb_config]
  57. page_data = bbox_list[i]
  58. # 强制转换为 float
  59. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  60. for j, bbox in enumerate(page_data):
  61. # 确保bbox的每个元素都是float
  62. rect = cal_canvas_rect(page, bbox) # Define the rectangle
  63. if draw_bbox:
  64. if fill_config:
  65. c.setFillColorRGB(*new_rgb, 0.3)
  66. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  67. else:
  68. c.setStrokeColorRGB(*new_rgb)
  69. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  70. c.setFillColorRGB(*new_rgb, 1.0)
  71. c.setFontSize(size=10)
  72. c.saveState()
  73. rotation = page.get("/Rotate", 0)
  74. rotation = rotation % 360
  75. if 0 == rotation:
  76. c.translate(rect[0] + rect[2] + 2, rect[1] + rect[3] - 10)
  77. elif 90 == rotation:
  78. c.translate(rect[0] + 2, rect[1] + rect[3] - 10)
  79. elif 180 == rotation:
  80. c.translate(rect[0] + 2, rect[1] - 10)
  81. elif 270 == rotation:
  82. c.translate(rect[0] + rect[2] + 2, rect[1] - 10)
  83. c.rotate(rotation)
  84. c.drawString(0, 0, str(j + 1))
  85. c.restoreState()
  86. return c
  87. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  88. dropped_bbox_list = []
  89. tables_list, tables_body_list = [], []
  90. tables_caption_list, tables_footnote_list = [], []
  91. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  92. imgs_footnote_list = []
  93. titles_list = []
  94. texts_list = []
  95. interequations_list = []
  96. lists_list = []
  97. indexs_list = []
  98. for page in pdf_info:
  99. page_dropped_list = []
  100. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  101. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  102. titles = []
  103. texts = []
  104. interequations = []
  105. lists = []
  106. indices = []
  107. for dropped_bbox in page['discarded_blocks']:
  108. page_dropped_list.append(dropped_bbox['bbox'])
  109. dropped_bbox_list.append(page_dropped_list)
  110. for block in page["para_blocks"]:
  111. bbox = block["bbox"]
  112. if block["type"] == BlockType.TABLE:
  113. tables.append(bbox)
  114. for nested_block in block["blocks"]:
  115. bbox = nested_block["bbox"]
  116. if nested_block["type"] == BlockType.TABLE_BODY:
  117. tables_body.append(bbox)
  118. elif nested_block["type"] == BlockType.TABLE_CAPTION:
  119. tables_caption.append(bbox)
  120. elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:
  121. tables_footnote.append(bbox)
  122. elif block["type"] == BlockType.IMAGE:
  123. imgs.append(bbox)
  124. for nested_block in block["blocks"]:
  125. bbox = nested_block["bbox"]
  126. if nested_block["type"] == BlockType.IMAGE_BODY:
  127. imgs_body.append(bbox)
  128. elif nested_block["type"] == BlockType.IMAGE_CAPTION:
  129. imgs_caption.append(bbox)
  130. elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
  131. imgs_footnote.append(bbox)
  132. elif block["type"] == BlockType.TITLE:
  133. titles.append(bbox)
  134. elif block["type"] == BlockType.TEXT:
  135. texts.append(bbox)
  136. elif block["type"] == BlockType.INTERLINE_EQUATION:
  137. interequations.append(bbox)
  138. elif block["type"] == BlockType.LIST:
  139. lists.append(bbox)
  140. elif block["type"] == BlockType.INDEX:
  141. indices.append(bbox)
  142. tables_list.append(tables)
  143. tables_body_list.append(tables_body)
  144. tables_caption_list.append(tables_caption)
  145. tables_footnote_list.append(tables_footnote)
  146. imgs_list.append(imgs)
  147. imgs_body_list.append(imgs_body)
  148. imgs_caption_list.append(imgs_caption)
  149. imgs_footnote_list.append(imgs_footnote)
  150. titles_list.append(titles)
  151. texts_list.append(texts)
  152. interequations_list.append(interequations)
  153. lists_list.append(lists)
  154. indexs_list.append(indices)
  155. layout_bbox_list = []
  156. table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
  157. for page in pdf_info:
  158. page_block_list = []
  159. for block in page["para_blocks"]:
  160. if block["type"] in [
  161. BlockType.TEXT,
  162. BlockType.TITLE,
  163. BlockType.INTERLINE_EQUATION,
  164. BlockType.LIST,
  165. BlockType.INDEX,
  166. ]:
  167. bbox = block["bbox"]
  168. page_block_list.append(bbox)
  169. elif block["type"] in [BlockType.IMAGE]:
  170. for sub_block in block["blocks"]:
  171. bbox = sub_block["bbox"]
  172. page_block_list.append(bbox)
  173. elif block["type"] in [BlockType.TABLE]:
  174. sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
  175. for sub_block in sorted_blocks:
  176. bbox = sub_block["bbox"]
  177. page_block_list.append(bbox)
  178. layout_bbox_list.append(page_block_list)
  179. pdf_bytes_io = BytesIO(pdf_bytes)
  180. pdf_docs = PdfReader(pdf_bytes_io)
  181. output_pdf = PdfWriter()
  182. for i, page in enumerate(pdf_docs.pages):
  183. # 获取原始页面尺寸
  184. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  185. custom_page_size = (page_width, page_height)
  186. packet = BytesIO()
  187. # 使用原始PDF的尺寸创建canvas
  188. c = canvas.Canvas(packet, pagesize=custom_page_size)
  189. c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
  190. c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
  191. c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
  192. c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
  193. c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
  194. c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
  195. c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
  196. c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
  197. c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
  198. c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
  199. c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
  200. c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
  201. c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
  202. c.save()
  203. packet.seek(0)
  204. overlay_pdf = PdfReader(packet)
  205. # 添加检查确保overlay_pdf.pages不为空
  206. if len(overlay_pdf.pages) > 0:
  207. page.merge_page(overlay_pdf.pages[0])
  208. else:
  209. # 记录日志并继续处理下一个页面
  210. # logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF")
  211. pass
  212. output_pdf.add_page(page)
  213. # 保存结果
  214. with open(f"{out_path}/{filename}", "wb") as f:
  215. output_pdf.write(f)
  216. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  217. text_list = []
  218. inline_equation_list = []
  219. interline_equation_list = []
  220. image_list = []
  221. table_list = []
  222. dropped_list = []
  223. next_page_text_list = []
  224. next_page_inline_equation_list = []
  225. def get_span_info(span):
  226. if span['type'] == ContentType.TEXT:
  227. if span.get('cross_page', False):
  228. next_page_text_list.append(span['bbox'])
  229. else:
  230. page_text_list.append(span['bbox'])
  231. elif span['type'] == ContentType.INLINE_EQUATION:
  232. if span.get('cross_page', False):
  233. next_page_inline_equation_list.append(span['bbox'])
  234. else:
  235. page_inline_equation_list.append(span['bbox'])
  236. elif span['type'] == ContentType.INTERLINE_EQUATION:
  237. page_interline_equation_list.append(span['bbox'])
  238. elif span['type'] == ContentType.IMAGE:
  239. page_image_list.append(span['bbox'])
  240. elif span['type'] == ContentType.TABLE:
  241. page_table_list.append(span['bbox'])
  242. for page in pdf_info:
  243. page_text_list = []
  244. page_inline_equation_list = []
  245. page_interline_equation_list = []
  246. page_image_list = []
  247. page_table_list = []
  248. page_dropped_list = []
  249. # 将跨页的span放到移动到下一页的列表中
  250. if len(next_page_text_list) > 0:
  251. page_text_list.extend(next_page_text_list)
  252. next_page_text_list.clear()
  253. if len(next_page_inline_equation_list) > 0:
  254. page_inline_equation_list.extend(next_page_inline_equation_list)
  255. next_page_inline_equation_list.clear()
  256. # 构造dropped_list
  257. for block in page['discarded_blocks']:
  258. if block['type'] == BlockType.DISCARDED:
  259. for line in block['lines']:
  260. for span in line['spans']:
  261. page_dropped_list.append(span['bbox'])
  262. dropped_list.append(page_dropped_list)
  263. # 构造其余useful_list
  264. # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
  265. for block in page['preproc_blocks']:
  266. if block['type'] in [
  267. BlockType.TEXT,
  268. BlockType.TITLE,
  269. BlockType.INTERLINE_EQUATION,
  270. BlockType.LIST,
  271. BlockType.INDEX,
  272. ]:
  273. for line in block['lines']:
  274. for span in line['spans']:
  275. get_span_info(span)
  276. elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  277. for sub_block in block['blocks']:
  278. for line in sub_block['lines']:
  279. for span in line['spans']:
  280. get_span_info(span)
  281. text_list.append(page_text_list)
  282. inline_equation_list.append(page_inline_equation_list)
  283. interline_equation_list.append(page_interline_equation_list)
  284. image_list.append(page_image_list)
  285. table_list.append(page_table_list)
  286. pdf_bytes_io = BytesIO(pdf_bytes)
  287. pdf_docs = PdfReader(pdf_bytes_io)
  288. output_pdf = PdfWriter()
  289. for i, page in enumerate(pdf_docs.pages):
  290. # 获取原始页面尺寸
  291. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  292. custom_page_size = (page_width, page_height)
  293. packet = BytesIO()
  294. # 使用原始PDF的尺寸创建canvas
  295. c = canvas.Canvas(packet, pagesize=custom_page_size)
  296. # 获取当前页面的数据
  297. draw_bbox_without_number(i, text_list, page, c,[255, 0, 0], False)
  298. draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
  299. draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
  300. draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
  301. draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
  302. draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
  303. c.save()
  304. packet.seek(0)
  305. overlay_pdf = PdfReader(packet)
  306. # 添加检查确保overlay_pdf.pages不为空
  307. if len(overlay_pdf.pages) > 0:
  308. page.merge_page(overlay_pdf.pages[0])
  309. else:
  310. # 记录日志并继续处理下一个页面
  311. # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
  312. pass
  313. output_pdf.add_page(page)
  314. # Save the PDF
  315. with open(f"{out_path}/{filename}", "wb") as f:
  316. output_pdf.write(f)
  317. if __name__ == "__main__":
  318. # 读取PDF文件
  319. pdf_path = "examples/demo1.pdf"
  320. with open(pdf_path, "rb") as f:
  321. pdf_bytes = f.read()
  322. # 从json文件读取pdf_info
  323. json_path = "examples/demo1_1746005777.0863056_middle.json"
  324. with open(json_path, "r", encoding="utf-8") as f:
  325. pdf_ann = json.load(f)
  326. pdf_info = pdf_ann["pdf_info"]
  327. # 调用可视化函数,输出到examples目录
  328. draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")