draw_bbox.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. import json
  2. from io import BytesIO
  3. from loguru import logger
  4. from pypdf import PdfReader, PdfWriter, PageObject
  5. from reportlab.pdfgen import canvas
  6. from .enum_class import BlockType, ContentType, SplitFlag
  7. def cal_canvas_rect(page, bbox):
  8. """
  9. Calculate the rectangle coordinates on the canvas based on the original PDF page and bounding box.
  10. Args:
  11. page: A PyPDF2 Page object representing a single page in the PDF.
  12. bbox: [x0, y0, x1, y1] representing the bounding box coordinates.
  13. Returns:
  14. rect: [x0, y0, width, height] representing the rectangle coordinates on the canvas.
  15. """
  16. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  17. actual_width = page_width # The width of the final PDF display
  18. actual_height = page_height # The height of the final PDF display
  19. rotation_obj = page.get("/Rotate", 0)
  20. try:
  21. rotation = int(rotation_obj) % 360 # cast rotation to int to handle IndirectObject
  22. except (ValueError, TypeError) as e:
  23. logger.warning(f"Invalid /Rotate value {rotation_obj!r} on page; defaulting to 0. Error: {e}")
  24. rotation = 0
  25. if rotation in [90, 270]:
  26. # PDF is rotated 90 degrees or 270 degrees, and the width and height need to be swapped
  27. actual_width, actual_height = actual_height, actual_width
  28. x0, y0, x1, y1 = bbox
  29. rect_w = abs(x1 - x0)
  30. rect_h = abs(y1 - y0)
  31. if rotation == 270:
  32. rect_w, rect_h = rect_h, rect_w
  33. x0 = actual_height - y1
  34. y0 = actual_width - x1
  35. elif rotation == 180:
  36. x0 = page_width - x1
  37. # y0 stays the same
  38. elif rotation == 90:
  39. rect_w, rect_h = rect_h, rect_w
  40. x0, y0 = y0, x0
  41. else:
  42. # rotation == 0
  43. y0 = page_height - y1
  44. rect = [x0, y0, rect_w, rect_h]
  45. return rect
  46. def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
  47. new_rgb = [float(color) / 255 for color in rgb_config]
  48. page_data = bbox_list[i]
  49. for bbox in page_data:
  50. rect = cal_canvas_rect(page, bbox) # Define the rectangle
  51. if fill_config: # filled rectangle
  52. c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)
  53. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  54. else: # bounding box
  55. c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])
  56. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  57. return c
  58. def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
  59. new_rgb = [float(color) / 255 for color in rgb_config]
  60. page_data = bbox_list[i]
  61. # 强制转换为 float
  62. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  63. for j, bbox in enumerate(page_data):
  64. # 确保bbox的每个元素都是float
  65. rect = cal_canvas_rect(page, bbox) # Define the rectangle
  66. if draw_bbox:
  67. if fill_config:
  68. c.setFillColorRGB(*new_rgb, 0.3)
  69. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
  70. else:
  71. c.setStrokeColorRGB(*new_rgb)
  72. c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
  73. c.setFillColorRGB(*new_rgb, 1.0)
  74. c.setFontSize(size=10)
  75. c.saveState()
  76. rotation_obj = page.get("/Rotate", 0)
  77. try:
  78. rotation = int(rotation_obj) % 360 # cast rotation to int to handle IndirectObject
  79. except (ValueError, TypeError):
  80. logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")
  81. rotation = 0
  82. if rotation == 0:
  83. c.translate(rect[0] + rect[2] + 2, rect[1] + rect[3] - 10)
  84. elif rotation == 90:
  85. c.translate(rect[0] + 10, rect[1] + rect[3] + 2)
  86. elif rotation == 180:
  87. c.translate(rect[0] - 2, rect[1] + 10)
  88. elif rotation == 270:
  89. c.translate(rect[0] + rect[2] - 10, rect[1] - 2)
  90. c.rotate(rotation)
  91. c.drawString(0, 0, str(j + 1))
  92. c.restoreState()
  93. return c
  94. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  95. dropped_bbox_list = []
  96. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  97. imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
  98. codes_body_list, codes_caption_list = [], []
  99. titles_list = []
  100. texts_list = []
  101. interequations_list = []
  102. lists_list = []
  103. list_items_list = []
  104. indexs_list = []
  105. for page in pdf_info:
  106. page_dropped_list = []
  107. tables_body, tables_caption, tables_footnote = [], [], []
  108. imgs_body, imgs_caption, imgs_footnote = [], [], []
  109. codes_body, codes_caption = [], []
  110. titles = []
  111. texts = []
  112. interequations = []
  113. lists = []
  114. list_items = []
  115. indices = []
  116. for dropped_bbox in page['discarded_blocks']:
  117. page_dropped_list.append(dropped_bbox['bbox'])
  118. dropped_bbox_list.append(page_dropped_list)
  119. for block in page["para_blocks"]:
  120. bbox = block["bbox"]
  121. if block["type"] == BlockType.TABLE:
  122. for nested_block in block["blocks"]:
  123. bbox = nested_block["bbox"]
  124. if nested_block["type"] == BlockType.TABLE_BODY:
  125. tables_body.append(bbox)
  126. elif nested_block["type"] == BlockType.TABLE_CAPTION:
  127. tables_caption.append(bbox)
  128. elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:
  129. if nested_block.get(SplitFlag.CROSS_PAGE, False):
  130. continue
  131. tables_footnote.append(bbox)
  132. elif block["type"] == BlockType.IMAGE:
  133. for nested_block in block["blocks"]:
  134. bbox = nested_block["bbox"]
  135. if nested_block["type"] == BlockType.IMAGE_BODY:
  136. imgs_body.append(bbox)
  137. elif nested_block["type"] == BlockType.IMAGE_CAPTION:
  138. imgs_caption.append(bbox)
  139. elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
  140. imgs_footnote.append(bbox)
  141. elif block["type"] == BlockType.CODE:
  142. for nested_block in block["blocks"]:
  143. if nested_block["type"] == BlockType.CODE_BODY:
  144. bbox = nested_block["bbox"]
  145. codes_body.append(bbox)
  146. elif nested_block["type"] == BlockType.CODE_CAPTION:
  147. bbox = nested_block["bbox"]
  148. codes_caption.append(bbox)
  149. elif block["type"] == BlockType.TITLE:
  150. titles.append(bbox)
  151. elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]:
  152. texts.append(bbox)
  153. elif block["type"] == BlockType.INTERLINE_EQUATION:
  154. interequations.append(bbox)
  155. elif block["type"] == BlockType.LIST:
  156. lists.append(bbox)
  157. if "blocks" in block:
  158. for sub_block in block["blocks"]:
  159. list_items.append(sub_block["bbox"])
  160. elif block["type"] == BlockType.INDEX:
  161. indices.append(bbox)
  162. tables_body_list.append(tables_body)
  163. tables_caption_list.append(tables_caption)
  164. tables_footnote_list.append(tables_footnote)
  165. imgs_body_list.append(imgs_body)
  166. imgs_caption_list.append(imgs_caption)
  167. imgs_footnote_list.append(imgs_footnote)
  168. titles_list.append(titles)
  169. texts_list.append(texts)
  170. interequations_list.append(interequations)
  171. lists_list.append(lists)
  172. list_items_list.append(list_items)
  173. indexs_list.append(indices)
  174. codes_body_list.append(codes_body)
  175. codes_caption_list.append(codes_caption)
  176. layout_bbox_list = []
  177. table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
  178. for page in pdf_info:
  179. page_block_list = []
  180. for block in page["para_blocks"]:
  181. if block["type"] in [
  182. BlockType.TEXT,
  183. BlockType.REF_TEXT,
  184. BlockType.TITLE,
  185. BlockType.INTERLINE_EQUATION,
  186. BlockType.LIST,
  187. BlockType.INDEX,
  188. ]:
  189. bbox = block["bbox"]
  190. page_block_list.append(bbox)
  191. elif block["type"] in [BlockType.IMAGE]:
  192. for sub_block in block["blocks"]:
  193. bbox = sub_block["bbox"]
  194. page_block_list.append(bbox)
  195. elif block["type"] in [BlockType.TABLE]:
  196. sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
  197. for sub_block in sorted_blocks:
  198. if sub_block.get(SplitFlag.CROSS_PAGE, False):
  199. continue
  200. bbox = sub_block["bbox"]
  201. page_block_list.append(bbox)
  202. elif block["type"] in [BlockType.CODE]:
  203. for sub_block in block["blocks"]:
  204. bbox = sub_block["bbox"]
  205. page_block_list.append(bbox)
  206. layout_bbox_list.append(page_block_list)
  207. pdf_bytes_io = BytesIO(pdf_bytes)
  208. pdf_docs = PdfReader(pdf_bytes_io)
  209. output_pdf = PdfWriter()
  210. for i, page in enumerate(pdf_docs.pages):
  211. # 获取原始页面尺寸
  212. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  213. custom_page_size = (page_width, page_height)
  214. packet = BytesIO()
  215. # 使用原始PDF的尺寸创建canvas
  216. c = canvas.Canvas(packet, pagesize=custom_page_size)
  217. c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)
  218. c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)
  219. c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
  220. c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
  221. c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
  222. c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
  223. c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
  224. c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
  225. c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
  226. c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
  227. c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
  228. c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
  229. c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
  230. c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
  231. c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
  232. c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
  233. c.save()
  234. packet.seek(0)
  235. overlay_pdf = PdfReader(packet)
  236. # 添加检查确保overlay_pdf.pages不为空
  237. if len(overlay_pdf.pages) > 0:
  238. new_page = PageObject(pdf=None)
  239. new_page.update(page)
  240. page = new_page
  241. page.merge_page(overlay_pdf.pages[0])
  242. else:
  243. # 记录日志并继续处理下一个页面
  244. # logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF")
  245. pass
  246. output_pdf.add_page(page)
  247. # 保存结果
  248. with open(f"{out_path}/{filename}", "wb") as f:
  249. output_pdf.write(f)
  250. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  251. text_list = []
  252. inline_equation_list = []
  253. interline_equation_list = []
  254. image_list = []
  255. table_list = []
  256. dropped_list = []
  257. def get_span_info(span):
  258. if span['type'] == ContentType.TEXT:
  259. page_text_list.append(span['bbox'])
  260. elif span['type'] == ContentType.INLINE_EQUATION:
  261. page_inline_equation_list.append(span['bbox'])
  262. elif span['type'] == ContentType.INTERLINE_EQUATION:
  263. page_interline_equation_list.append(span['bbox'])
  264. elif span['type'] == ContentType.IMAGE:
  265. page_image_list.append(span['bbox'])
  266. elif span['type'] == ContentType.TABLE:
  267. page_table_list.append(span['bbox'])
  268. for page in pdf_info:
  269. page_text_list = []
  270. page_inline_equation_list = []
  271. page_interline_equation_list = []
  272. page_image_list = []
  273. page_table_list = []
  274. page_dropped_list = []
  275. # 构造dropped_list
  276. for block in page['discarded_blocks']:
  277. if block['type'] == BlockType.DISCARDED:
  278. for line in block['lines']:
  279. for span in line['spans']:
  280. page_dropped_list.append(span['bbox'])
  281. dropped_list.append(page_dropped_list)
  282. # 构造其余useful_list
  283. # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
  284. for block in page['preproc_blocks']:
  285. if block['type'] in [
  286. BlockType.TEXT,
  287. BlockType.TITLE,
  288. BlockType.INTERLINE_EQUATION,
  289. BlockType.LIST,
  290. BlockType.INDEX,
  291. ]:
  292. for line in block['lines']:
  293. for span in line['spans']:
  294. get_span_info(span)
  295. elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  296. for sub_block in block['blocks']:
  297. for line in sub_block['lines']:
  298. for span in line['spans']:
  299. get_span_info(span)
  300. text_list.append(page_text_list)
  301. inline_equation_list.append(page_inline_equation_list)
  302. interline_equation_list.append(page_interline_equation_list)
  303. image_list.append(page_image_list)
  304. table_list.append(page_table_list)
  305. pdf_bytes_io = BytesIO(pdf_bytes)
  306. pdf_docs = PdfReader(pdf_bytes_io)
  307. output_pdf = PdfWriter()
  308. for i, page in enumerate(pdf_docs.pages):
  309. # 获取原始页面尺寸
  310. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  311. custom_page_size = (page_width, page_height)
  312. packet = BytesIO()
  313. # 使用原始PDF的尺寸创建canvas
  314. c = canvas.Canvas(packet, pagesize=custom_page_size)
  315. # 获取当前页面的数据
  316. draw_bbox_without_number(i, text_list, page, c,[255, 0, 0], False)
  317. draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
  318. draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
  319. draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
  320. draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
  321. draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
  322. c.save()
  323. packet.seek(0)
  324. overlay_pdf = PdfReader(packet)
  325. # 添加检查确保overlay_pdf.pages不为空
  326. if len(overlay_pdf.pages) > 0:
  327. new_page = PageObject(pdf=None)
  328. new_page.update(page)
  329. page = new_page
  330. page.merge_page(overlay_pdf.pages[0])
  331. else:
  332. # 记录日志并继续处理下一个页面
  333. # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
  334. pass
  335. output_pdf.add_page(page)
  336. # Save the PDF
  337. with open(f"{out_path}/{filename}", "wb") as f:
  338. output_pdf.write(f)
  339. def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  340. layout_bbox_list = []
  341. for page in pdf_info:
  342. page_line_list = []
  343. for block in page['preproc_blocks']:
  344. if block['type'] in [BlockType.TEXT]:
  345. for line in block['lines']:
  346. bbox = line['bbox']
  347. index = line['index']
  348. page_line_list.append({'index': index, 'bbox': bbox})
  349. elif block['type'] in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  350. if 'virtual_lines' in block:
  351. if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
  352. for line in block['virtual_lines']:
  353. bbox = line['bbox']
  354. index = line['index']
  355. page_line_list.append({'index': index, 'bbox': bbox})
  356. else:
  357. for line in block['lines']:
  358. bbox = line['bbox']
  359. index = line['index']
  360. page_line_list.append({'index': index, 'bbox': bbox})
  361. elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  362. for sub_block in block['blocks']:
  363. if sub_block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
  364. if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
  365. for line in sub_block['virtual_lines']:
  366. bbox = line['bbox']
  367. index = line['index']
  368. page_line_list.append({'index': index, 'bbox': bbox})
  369. else:
  370. for line in sub_block['lines']:
  371. bbox = line['bbox']
  372. index = line['index']
  373. page_line_list.append({'index': index, 'bbox': bbox})
  374. elif sub_block['type'] in [BlockType.IMAGE_CAPTION, BlockType.TABLE_CAPTION, BlockType.IMAGE_FOOTNOTE, BlockType.TABLE_FOOTNOTE]:
  375. for line in sub_block['lines']:
  376. bbox = line['bbox']
  377. index = line['index']
  378. page_line_list.append({'index': index, 'bbox': bbox})
  379. sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
  380. layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
  381. pdf_bytes_io = BytesIO(pdf_bytes)
  382. pdf_docs = PdfReader(pdf_bytes_io)
  383. output_pdf = PdfWriter()
  384. for i, page in enumerate(pdf_docs.pages):
  385. # 获取原始页面尺寸
  386. page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
  387. custom_page_size = (page_width, page_height)
  388. packet = BytesIO()
  389. # 使用原始PDF的尺寸创建canvas
  390. c = canvas.Canvas(packet, pagesize=custom_page_size)
  391. # 获取当前页面的数据
  392. draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)
  393. c.save()
  394. packet.seek(0)
  395. overlay_pdf = PdfReader(packet)
  396. # 添加检查确保overlay_pdf.pages不为空
  397. if len(overlay_pdf.pages) > 0:
  398. new_page = PageObject(pdf=None)
  399. new_page.update(page)
  400. page = new_page
  401. page.merge_page(overlay_pdf.pages[0])
  402. else:
  403. # 记录日志并继续处理下一个页面
  404. # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
  405. pass
  406. output_pdf.add_page(page)
  407. # Save the PDF
  408. with open(f"{out_path}/{filename}", "wb") as f:
  409. output_pdf.write(f)
  410. if __name__ == "__main__":
  411. # 读取PDF文件
  412. pdf_path = "examples/demo1.pdf"
  413. with open(pdf_path, "rb") as f:
  414. pdf_bytes = f.read()
  415. # 从json文件读取pdf_info
  416. json_path = "examples/demo1_1746005777.0863056_middle.json"
  417. with open(json_path, "r", encoding="utf-8") as f:
  418. pdf_ann = json.load(f)
  419. pdf_info = pdf_ann["pdf_info"]
  420. # 调用可视化函数,输出到examples目录
  421. draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")