# draw_bbox.py
  1. from magic_pdf.libs.Constants import CROSS_PAGE
  2. from magic_pdf.libs.commons import fitz # PyMuPDF
  3. from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId
  4. from magic_pdf.model.magic_model import MagicModel
  5. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  6. new_rgb = []
  7. for item in rgb_config:
  8. item = float(item) / 255
  9. new_rgb.append(item)
  10. page_data = bbox_list[i]
  11. for bbox in page_data:
  12. x0, y0, x1, y1 = bbox
  13. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  14. if fill_config:
  15. page.draw_rect(
  16. rect_coords,
  17. color=None,
  18. fill=new_rgb,
  19. fill_opacity=0.3,
  20. width=0.5,
  21. overlay=True,
  22. ) # Draw the rectangle
  23. else:
  24. page.draw_rect(
  25. rect_coords,
  26. color=new_rgb,
  27. fill=None,
  28. fill_opacity=1,
  29. width=0.5,
  30. overlay=True,
  31. ) # Draw the rectangle
  32. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
  33. new_rgb = []
  34. for item in rgb_config:
  35. item = float(item) / 255
  36. new_rgb.append(item)
  37. page_data = bbox_list[i]
  38. for j, bbox in enumerate(page_data):
  39. x0, y0, x1, y1 = bbox
  40. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  41. if fill_config:
  42. page.draw_rect(
  43. rect_coords,
  44. color=None,
  45. fill=new_rgb,
  46. fill_opacity=0.3,
  47. width=0.5,
  48. overlay=True,
  49. ) # Draw the rectangle
  50. else:
  51. page.draw_rect(
  52. rect_coords,
  53. color=new_rgb,
  54. fill=None,
  55. fill_opacity=1,
  56. width=0.5,
  57. overlay=True,
  58. ) # Draw the rectangle
  59. page.insert_text(
  60. (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  61. ) # Insert the index in the top left corner of the rectangle
  62. def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
  63. layout_bbox_list = []
  64. dropped_bbox_list = []
  65. tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
  66. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  67. titles_list = []
  68. texts_list = []
  69. interequations_list = []
  70. for page in pdf_info:
  71. page_layout_list = []
  72. page_dropped_list = []
  73. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  74. imgs, imgs_body, imgs_caption = [], [], []
  75. titles = []
  76. texts = []
  77. interequations = []
  78. for layout in page["layout_bboxes"]:
  79. page_layout_list.append(layout["layout_bbox"])
  80. layout_bbox_list.append(page_layout_list)
  81. for dropped_bbox in page["discarded_blocks"]:
  82. page_dropped_list.append(dropped_bbox["bbox"])
  83. dropped_bbox_list.append(page_dropped_list)
  84. for block in page["para_blocks"]:
  85. bbox = block["bbox"]
  86. if block["type"] == BlockType.Table:
  87. tables.append(bbox)
  88. for nested_block in block["blocks"]:
  89. bbox = nested_block["bbox"]
  90. if nested_block["type"] == BlockType.TableBody:
  91. tables_body.append(bbox)
  92. elif nested_block["type"] == BlockType.TableCaption:
  93. tables_caption.append(bbox)
  94. elif nested_block["type"] == BlockType.TableFootnote:
  95. tables_footnote.append(bbox)
  96. elif block["type"] == BlockType.Image:
  97. imgs.append(bbox)
  98. for nested_block in block["blocks"]:
  99. bbox = nested_block["bbox"]
  100. if nested_block["type"] == BlockType.ImageBody:
  101. imgs_body.append(bbox)
  102. elif nested_block["type"] == BlockType.ImageCaption:
  103. imgs_caption.append(bbox)
  104. elif block["type"] == BlockType.Title:
  105. titles.append(bbox)
  106. elif block["type"] == BlockType.Text:
  107. texts.append(bbox)
  108. elif block["type"] == BlockType.InterlineEquation:
  109. interequations.append(bbox)
  110. tables_list.append(tables)
  111. tables_body_list.append(tables_body)
  112. tables_caption_list.append(tables_caption)
  113. tables_footnote_list.append(tables_footnote)
  114. imgs_list.append(imgs)
  115. imgs_body_list.append(imgs_body)
  116. imgs_caption_list.append(imgs_caption)
  117. titles_list.append(titles)
  118. texts_list.append(texts)
  119. interequations_list.append(interequations)
  120. pdf_docs = fitz.open("pdf", pdf_bytes)
  121. for i, page in enumerate(pdf_docs):
  122. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  123. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
  124. draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
  125. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
  126. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
  127. draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
  128. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  129. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  130. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
  131. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  132. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  133. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
  134. # Save the PDF
  135. pdf_docs.save(f"{out_path}/layout.pdf")
  136. def draw_span_bbox(pdf_info, pdf_bytes, out_path):
  137. text_list = []
  138. inline_equation_list = []
  139. interline_equation_list = []
  140. image_list = []
  141. table_list = []
  142. dropped_list = []
  143. next_page_text_list = []
  144. next_page_inline_equation_list = []
  145. def get_span_info(span):
  146. if span["type"] == ContentType.Text:
  147. if span.get(CROSS_PAGE, False):
  148. next_page_text_list.append(span["bbox"])
  149. else:
  150. page_text_list.append(span["bbox"])
  151. elif span["type"] == ContentType.InlineEquation:
  152. if span.get(CROSS_PAGE, False):
  153. next_page_inline_equation_list.append(span["bbox"])
  154. else:
  155. page_inline_equation_list.append(span["bbox"])
  156. elif span["type"] == ContentType.InterlineEquation:
  157. page_interline_equation_list.append(span["bbox"])
  158. elif span["type"] == ContentType.Image:
  159. page_image_list.append(span["bbox"])
  160. elif span["type"] == ContentType.Table:
  161. page_table_list.append(span["bbox"])
  162. for page in pdf_info:
  163. page_text_list = []
  164. page_inline_equation_list = []
  165. page_interline_equation_list = []
  166. page_image_list = []
  167. page_table_list = []
  168. page_dropped_list = []
  169. # 将跨页的span放到移动到下一页的列表中
  170. if len(next_page_text_list) > 0:
  171. page_text_list.extend(next_page_text_list)
  172. next_page_text_list.clear()
  173. if len(next_page_inline_equation_list) > 0:
  174. page_inline_equation_list.extend(next_page_inline_equation_list)
  175. next_page_inline_equation_list.clear()
  176. # 构造dropped_list
  177. for block in page["discarded_blocks"]:
  178. if block["type"] == BlockType.Discarded:
  179. for line in block["lines"]:
  180. for span in line["spans"]:
  181. page_dropped_list.append(span["bbox"])
  182. dropped_list.append(page_dropped_list)
  183. # 构造其余useful_list
  184. for block in page["para_blocks"]:
  185. if block["type"] in [
  186. BlockType.Text,
  187. BlockType.Title,
  188. BlockType.InterlineEquation,
  189. ]:
  190. for line in block["lines"]:
  191. for span in line["spans"]:
  192. get_span_info(span)
  193. elif block["type"] in [BlockType.Image, BlockType.Table]:
  194. for sub_block in block["blocks"]:
  195. for line in sub_block["lines"]:
  196. for span in line["spans"]:
  197. get_span_info(span)
  198. text_list.append(page_text_list)
  199. inline_equation_list.append(page_inline_equation_list)
  200. interline_equation_list.append(page_interline_equation_list)
  201. image_list.append(page_image_list)
  202. table_list.append(page_table_list)
  203. pdf_docs = fitz.open("pdf", pdf_bytes)
  204. for i, page in enumerate(pdf_docs):
  205. # 获取当前页面的数据
  206. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  207. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  208. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  209. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  210. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  211. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  212. # Save the PDF
  213. pdf_docs.save(f"{out_path}/spans.pdf")
  214. def drow_model_bbox(model_list: list, pdf_bytes, out_path):
  215. dropped_bbox_list = []
  216. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  217. imgs_body_list, imgs_caption_list = [], []
  218. titles_list = []
  219. texts_list = []
  220. interequations_list = []
  221. pdf_docs = fitz.open("pdf", pdf_bytes)
  222. magic_model = MagicModel(model_list, pdf_docs)
  223. for i in range(len(model_list)):
  224. page_dropped_list = []
  225. tables_body, tables_caption, tables_footnote = [], [], []
  226. imgs_body, imgs_caption = [], []
  227. titles = []
  228. texts = []
  229. interequations = []
  230. page_info = magic_model.get_model_list(i)
  231. layout_dets = page_info["layout_dets"]
  232. for layout_det in layout_dets:
  233. bbox = layout_det["bbox"]
  234. if layout_det["category_id"] == CategoryId.Text:
  235. texts.append(bbox)
  236. elif layout_det["category_id"] == CategoryId.Title:
  237. titles.append(bbox)
  238. elif layout_det["category_id"] == CategoryId.TableBody:
  239. tables_body.append(bbox)
  240. elif layout_det["category_id"] == CategoryId.TableCaption:
  241. tables_caption.append(bbox)
  242. elif layout_det["category_id"] == CategoryId.TableFootnote:
  243. tables_footnote.append(bbox)
  244. elif layout_det["category_id"] == CategoryId.ImageBody:
  245. imgs_body.append(bbox)
  246. elif layout_det["category_id"] == CategoryId.ImageCaption:
  247. imgs_caption.append(bbox)
  248. elif layout_det["category_id"] == CategoryId.InterlineEquation_YOLO:
  249. interequations.append(bbox)
  250. elif layout_det["category_id"] == CategoryId.Abandon:
  251. page_dropped_list.append(bbox)
  252. tables_body_list.append(tables_body)
  253. tables_caption_list.append(tables_caption)
  254. tables_footnote_list.append(tables_footnote)
  255. imgs_body_list.append(imgs_body)
  256. imgs_caption_list.append(imgs_caption)
  257. titles_list.append(titles)
  258. texts_list.append(texts)
  259. interequations_list.append(interequations)
  260. dropped_bbox_list.append(page_dropped_list)
  261. for i, page in enumerate(pdf_docs):
  262. draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True) # color !
  263. draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
  264. draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
  265. draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
  266. draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
  267. draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
  268. draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
  269. draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
  270. draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
  271. # Save the PDF
  272. pdf_docs.save(f"{out_path}/model.pdf")