draw_bbox.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. from magic_pdf.libs.commons import fitz # PyMuPDF
  2. from magic_pdf.libs.ocr_content_type import ContentType, BlockType
  3. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  4. new_rgb = []
  5. for item in rgb_config:
  6. item = float(item) / 255
  7. new_rgb.append(item)
  8. page_data = bbox_list[i]
  9. for bbox in page_data:
  10. x0, y0, x1, y1 = bbox
  11. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  12. if fill_config:
  13. page.draw_rect(
  14. rect_coords,
  15. color=None,
  16. fill=new_rgb,
  17. fill_opacity=0.3,
  18. width=0.5,
  19. overlay=True,
  20. ) # Draw the rectangle
  21. else:
  22. page.draw_rect(
  23. rect_coords,
  24. color=new_rgb,
  25. fill=None,
  26. fill_opacity=1,
  27. width=0.5,
  28. overlay=True,
  29. ) # Draw the rectangle
  30. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
  31. new_rgb = []
  32. for item in rgb_config:
  33. item = float(item) / 255
  34. new_rgb.append(item)
  35. page_data = bbox_list[i]
  36. for j, bbox in enumerate(page_data):
  37. x0, y0, x1, y1 = bbox
  38. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  39. if fill_config:
  40. page.draw_rect(
  41. rect_coords,
  42. color=None,
  43. fill=new_rgb,
  44. fill_opacity=0.3,
  45. width=0.5,
  46. overlay=True,
  47. ) # Draw the rectangle
  48. else:
  49. page.draw_rect(
  50. rect_coords,
  51. color=new_rgb,
  52. fill=None,
  53. fill_opacity=1,
  54. width=0.5,
  55. overlay=True,
  56. ) # Draw the rectangle
  57. page.insert_text(
  58. (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  59. ) # Insert the index at the top left corner of the rectangle
  60. def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
  61. layout_bbox_list = []
  62. blocks_bbox_list = []
  63. dropped_bbox_list = []
  64. tables_list, tables_body_list, tables_caption_list, tables_footnote_list = (
  65. [],
  66. [],
  67. [],
  68. [],
  69. )
  70. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  71. titles_list = []
  72. texts_list = []
  73. interequations_list = []
  74. for page in pdf_info:
  75. page_layout_list = []
  76. page_dropped_list = []
  77. page_blocks_bbox_list = []
  78. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  79. imgs, imgs_body, imgs_caption = [], [], []
  80. titles = []
  81. texts = []
  82. interequations = []
  83. for layout in page["layout_bboxes"]:
  84. page_layout_list.append(layout["layout_bbox"])
  85. layout_bbox_list.append(page_layout_list)
  86. for dropped_bbox in page["discarded_blocks"]:
  87. page_dropped_list.append(dropped_bbox["bbox"])
  88. dropped_bbox_list.append(page_dropped_list)
  89. for block in page["para_blocks"]:
  90. bbox = block["bbox"]
  91. if block["type"] == BlockType.Table:
  92. tables.append(bbox)
  93. for nested_block in block["blocks"]:
  94. bbox = nested_block["bbox"]
  95. if nested_block["type"] == BlockType.TableBody:
  96. tables_body.append(bbox)
  97. elif nested_block["type"] == BlockType.TableCaption:
  98. tables_caption.append(bbox)
  99. elif nested_block["type"] == BlockType.TableFootnote:
  100. tables_footnote.append(bbox)
  101. elif block["type"] == BlockType.Image:
  102. imgs.append(bbox)
  103. for nested_block in block["blocks"]:
  104. bbox = nested_block["bbox"]
  105. if nested_block["type"] == BlockType.ImageBody:
  106. imgs_body.append(bbox)
  107. elif nested_block["type"] == BlockType.ImageCaption:
  108. imgs_caption.append(bbox)
  109. elif block["type"] == BlockType.Title:
  110. titles.append(bbox)
  111. elif block["type"] == BlockType.Text:
  112. texts.append(bbox)
  113. elif block["type"] == BlockType.InterlineEquation:
  114. interequations.append(bbox)
  115. tables_list.append(tables)
  116. tables_body_list.append(tables_body)
  117. tables_caption_list.append(tables_caption)
  118. tables_footnote_list.append(tables_footnote)
  119. imgs_list.append(imgs)
  120. imgs_body_list.append(imgs_body)
  121. imgs_caption_list.append(imgs_caption)
  122. titles_list.append(titles)
  123. texts_list.append(texts)
  124. interequations_list.append(interequations)
  125. pdf_docs = fitz.open("pdf", pdf_bytes)
  126. for i, page in enumerate(pdf_docs):
  127. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  128. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
  129. draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
  130. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
  131. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
  132. draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
  133. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  134. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  135. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
  136. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  137. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  138. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
  139. # Save the PDF
  140. pdf_docs.save(f"{out_path}/layout.pdf")
  141. def draw_span_bbox(pdf_info, pdf_bytes, out_path):
  142. text_list = []
  143. inline_equation_list = []
  144. interline_equation_list = []
  145. image_list = []
  146. table_list = []
  147. for page in pdf_info:
  148. page_text_list = []
  149. page_inline_equation_list = []
  150. page_interline_equation_list = []
  151. page_image_list = []
  152. page_table_list = []
  153. for block in page["para_blocks"]:
  154. if block["type"] in [
  155. BlockType.Text,
  156. BlockType.Title,
  157. BlockType.InterlineEquation,
  158. ]:
  159. for line in block["lines"]:
  160. for span in line["spans"]:
  161. if span["type"] == ContentType.Text:
  162. page_text_list.append(span["bbox"])
  163. elif span["type"] == ContentType.InlineEquation:
  164. page_inline_equation_list.append(span["bbox"])
  165. elif span["type"] == ContentType.InterlineEquation:
  166. page_interline_equation_list.append(span["bbox"])
  167. elif span["type"] == ContentType.Image:
  168. page_image_list.append(span["bbox"])
  169. elif span["type"] == ContentType.Table:
  170. page_table_list.append(span["bbox"])
  171. elif block["type"] in [BlockType.Image, BlockType.Table]:
  172. for sub_block in block["blocks"]:
  173. for line in sub_block["lines"]:
  174. for span in line["spans"]:
  175. if span["type"] == ContentType.Text:
  176. page_text_list.append(span["bbox"])
  177. elif span["type"] == ContentType.InlineEquation:
  178. page_inline_equation_list.append(span["bbox"])
  179. elif span["type"] == ContentType.InterlineEquation:
  180. page_interline_equation_list.append(span["bbox"])
  181. elif span["type"] == ContentType.Image:
  182. page_image_list.append(span["bbox"])
  183. elif span["type"] == ContentType.Table:
  184. page_table_list.append(span["bbox"])
  185. text_list.append(page_text_list)
  186. inline_equation_list.append(page_inline_equation_list)
  187. interline_equation_list.append(page_interline_equation_list)
  188. image_list.append(page_image_list)
  189. table_list.append(page_table_list)
  190. pdf_docs = fitz.open("pdf", pdf_bytes)
  191. for i, page in enumerate(pdf_docs):
  192. # 获取当前页面的数据
  193. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  194. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  195. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  196. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  197. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  198. # Save the PDF
  199. pdf_docs.save(f"{out_path}/spans.pdf")