draw_bbox.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. from magic_pdf.libs.Constants import CROSS_PAGE
  2. from magic_pdf.libs.commons import fitz # PyMuPDF
  3. from magic_pdf.libs.ocr_content_type import ContentType, BlockType
  4. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  5. new_rgb = []
  6. for item in rgb_config:
  7. item = float(item) / 255
  8. new_rgb.append(item)
  9. page_data = bbox_list[i]
  10. for bbox in page_data:
  11. x0, y0, x1, y1 = bbox
  12. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  13. if fill_config:
  14. page.draw_rect(
  15. rect_coords,
  16. color=None,
  17. fill=new_rgb,
  18. fill_opacity=0.3,
  19. width=0.5,
  20. overlay=True,
  21. ) # Draw the rectangle
  22. else:
  23. page.draw_rect(
  24. rect_coords,
  25. color=new_rgb,
  26. fill=None,
  27. fill_opacity=1,
  28. width=0.5,
  29. overlay=True,
  30. ) # Draw the rectangle
  31. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
  32. new_rgb = []
  33. for item in rgb_config:
  34. item = float(item) / 255
  35. new_rgb.append(item)
  36. page_data = bbox_list[i]
  37. for j, bbox in enumerate(page_data):
  38. x0, y0, x1, y1 = bbox
  39. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  40. if fill_config:
  41. page.draw_rect(
  42. rect_coords,
  43. color=None,
  44. fill=new_rgb,
  45. fill_opacity=0.3,
  46. width=0.5,
  47. overlay=True,
  48. ) # Draw the rectangle
  49. else:
  50. page.draw_rect(
  51. rect_coords,
  52. color=new_rgb,
  53. fill=None,
  54. fill_opacity=1,
  55. width=0.5,
  56. overlay=True,
  57. ) # Draw the rectangle
  58. page.insert_text(
  59. (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  60. ) # Insert the index at the top left corner of the rectangle
  61. def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
  62. layout_bbox_list = []
  63. dropped_bbox_list = []
  64. tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
  65. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  66. titles_list = []
  67. texts_list = []
  68. interequations_list = []
  69. for page in pdf_info:
  70. page_layout_list = []
  71. page_dropped_list = []
  72. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  73. imgs, imgs_body, imgs_caption = [], [], []
  74. titles = []
  75. texts = []
  76. interequations = []
  77. for layout in page["layout_bboxes"]:
  78. page_layout_list.append(layout["layout_bbox"])
  79. layout_bbox_list.append(page_layout_list)
  80. for dropped_bbox in page["discarded_blocks"]:
  81. page_dropped_list.append(dropped_bbox["bbox"])
  82. dropped_bbox_list.append(page_dropped_list)
  83. for block in page["para_blocks"]:
  84. bbox = block["bbox"]
  85. if block["type"] == BlockType.Table:
  86. tables.append(bbox)
  87. for nested_block in block["blocks"]:
  88. bbox = nested_block["bbox"]
  89. if nested_block["type"] == BlockType.TableBody:
  90. tables_body.append(bbox)
  91. elif nested_block["type"] == BlockType.TableCaption:
  92. tables_caption.append(bbox)
  93. elif nested_block["type"] == BlockType.TableFootnote:
  94. tables_footnote.append(bbox)
  95. elif block["type"] == BlockType.Image:
  96. imgs.append(bbox)
  97. for nested_block in block["blocks"]:
  98. bbox = nested_block["bbox"]
  99. if nested_block["type"] == BlockType.ImageBody:
  100. imgs_body.append(bbox)
  101. elif nested_block["type"] == BlockType.ImageCaption:
  102. imgs_caption.append(bbox)
  103. elif block["type"] == BlockType.Title:
  104. titles.append(bbox)
  105. elif block["type"] == BlockType.Text:
  106. texts.append(bbox)
  107. elif block["type"] == BlockType.InterlineEquation:
  108. interequations.append(bbox)
  109. tables_list.append(tables)
  110. tables_body_list.append(tables_body)
  111. tables_caption_list.append(tables_caption)
  112. tables_footnote_list.append(tables_footnote)
  113. imgs_list.append(imgs)
  114. imgs_body_list.append(imgs_body)
  115. imgs_caption_list.append(imgs_caption)
  116. titles_list.append(titles)
  117. texts_list.append(texts)
  118. interequations_list.append(interequations)
  119. pdf_docs = fitz.open("pdf", pdf_bytes)
  120. for i, page in enumerate(pdf_docs):
  121. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  122. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
  123. draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
  124. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
  125. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
  126. draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
  127. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  128. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  129. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
  130. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  131. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  132. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
  133. # Save the PDF
  134. pdf_docs.save(f"{out_path}/layout.pdf")
  135. def draw_span_bbox(pdf_info, pdf_bytes, out_path):
  136. text_list = []
  137. inline_equation_list = []
  138. interline_equation_list = []
  139. image_list = []
  140. table_list = []
  141. dropped_list = []
  142. next_page_text_list = []
  143. next_page_inline_equation_list = []
  144. for page in pdf_info:
  145. page_text_list = []
  146. page_inline_equation_list = []
  147. page_interline_equation_list = []
  148. page_image_list = []
  149. page_table_list = []
  150. page_dropped_list = []
  151. # 将跨页的span放到移动到下一页的列表中
  152. if len(next_page_text_list) > 0:
  153. page_text_list.extend(next_page_text_list)
  154. next_page_text_list = []
  155. if len(next_page_inline_equation_list) > 0:
  156. page_inline_equation_list.extend(next_page_inline_equation_list)
  157. next_page_inline_equation_list = []
  158. # 构造dropped_list
  159. for block in page["discarded_blocks"]:
  160. if block["type"] == BlockType.Discarded:
  161. for line in block["lines"]:
  162. for span in line["spans"]:
  163. page_dropped_list.append(span["bbox"])
  164. dropped_list.append(page_dropped_list)
  165. # 构造其余useful_list
  166. for block in page["para_blocks"]:
  167. if block["type"] in [
  168. BlockType.Text,
  169. BlockType.Title,
  170. BlockType.InterlineEquation,
  171. ]:
  172. for line in block["lines"]:
  173. for span in line["spans"]:
  174. if span["type"] == ContentType.Text:
  175. if span.get(CROSS_PAGE, False):
  176. next_page_text_list.append(span["bbox"])
  177. else:
  178. page_text_list.append(span["bbox"])
  179. elif span["type"] == ContentType.InlineEquation:
  180. if span.get(CROSS_PAGE, False):
  181. next_page_inline_equation_list.append(span["bbox"])
  182. else:
  183. page_inline_equation_list.append(span["bbox"])
  184. elif span["type"] == ContentType.InterlineEquation:
  185. page_interline_equation_list.append(span["bbox"])
  186. elif span["type"] == ContentType.Image:
  187. page_image_list.append(span["bbox"])
  188. elif span["type"] == ContentType.Table:
  189. page_table_list.append(span["bbox"])
  190. elif block["type"] in [BlockType.Image, BlockType.Table]:
  191. for sub_block in block["blocks"]:
  192. for line in sub_block["lines"]:
  193. for span in line["spans"]:
  194. if span["type"] == ContentType.Text:
  195. page_text_list.append(span["bbox"])
  196. elif span["type"] == ContentType.InlineEquation:
  197. page_inline_equation_list.append(span["bbox"])
  198. elif span["type"] == ContentType.InterlineEquation:
  199. page_interline_equation_list.append(span["bbox"])
  200. elif span["type"] == ContentType.Image:
  201. page_image_list.append(span["bbox"])
  202. elif span["type"] == ContentType.Table:
  203. page_table_list.append(span["bbox"])
  204. text_list.append(page_text_list)
  205. inline_equation_list.append(page_inline_equation_list)
  206. interline_equation_list.append(page_interline_equation_list)
  207. image_list.append(page_image_list)
  208. table_list.append(page_table_list)
  209. pdf_docs = fitz.open("pdf", pdf_bytes)
  210. for i, page in enumerate(pdf_docs):
  211. # 获取当前页面的数据
  212. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  213. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  214. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  215. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  216. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  217. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  218. # Save the PDF
  219. pdf_docs.save(f"{out_path}/spans.pdf")