draw_bbox.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. import time
  2. import torch
  3. from magic_pdf.libs.commons import fitz # PyMuPDF
  4. from magic_pdf.libs.Constants import CROSS_PAGE
  5. from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
  6. from magic_pdf.model.magic_model import MagicModel
  7. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  8. new_rgb = []
  9. for item in rgb_config:
  10. item = float(item) / 255
  11. new_rgb.append(item)
  12. page_data = bbox_list[i]
  13. for bbox in page_data:
  14. x0, y0, x1, y1 = bbox
  15. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  16. if fill_config:
  17. page.draw_rect(
  18. rect_coords,
  19. color=None,
  20. fill=new_rgb,
  21. fill_opacity=0.3,
  22. width=0.5,
  23. overlay=True,
  24. ) # Draw the rectangle
  25. else:
  26. page.draw_rect(
  27. rect_coords,
  28. color=new_rgb,
  29. fill=None,
  30. fill_opacity=1,
  31. width=0.5,
  32. overlay=True,
  33. ) # Draw the rectangle
  34. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, width=0.5):
  35. new_rgb = []
  36. for item in rgb_config:
  37. item = float(item) / 255
  38. new_rgb.append(item)
  39. page_data = bbox_list[i]
  40. for j, bbox in enumerate(page_data):
  41. x0, y0, x1, y1 = bbox
  42. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  43. if fill_config:
  44. page.draw_rect(
  45. rect_coords,
  46. color=None,
  47. fill=new_rgb,
  48. fill_opacity=0.3,
  49. width=width,
  50. overlay=True,
  51. ) # Draw the rectangle
  52. else:
  53. page.draw_rect(
  54. rect_coords,
  55. color=new_rgb,
  56. fill=None,
  57. fill_opacity=1,
  58. width=width,
  59. overlay=True,
  60. ) # Draw the rectangle
  61. page.insert_text(
  62. (x1+2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  63. ) # Insert the index in the top left corner of the rectangle
  64. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  65. # layout_bbox_list = []
  66. dropped_bbox_list = []
  67. tables_list, tables_body_list = [], []
  68. tables_caption_list, tables_footnote_list = [], []
  69. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  70. imgs_footnote_list = []
  71. titles_list = []
  72. texts_list = []
  73. interequations_list = []
  74. for page in pdf_info:
  75. # page_layout_list = []
  76. page_dropped_list = []
  77. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  78. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  79. titles = []
  80. texts = []
  81. interequations = []
  82. # for layout in page['layout_bboxes']:
  83. # page_layout_list.append(layout['layout_bbox'])
  84. # layout_bbox_list.append(page_layout_list)
  85. for dropped_bbox in page['discarded_blocks']:
  86. page_dropped_list.append(dropped_bbox['bbox'])
  87. dropped_bbox_list.append(page_dropped_list)
  88. for block in page['para_blocks']:
  89. bbox = block['bbox']
  90. if block['type'] == BlockType.Table:
  91. tables.append(bbox)
  92. for nested_block in block['blocks']:
  93. bbox = nested_block['bbox']
  94. if nested_block['type'] == BlockType.TableBody:
  95. tables_body.append(bbox)
  96. elif nested_block['type'] == BlockType.TableCaption:
  97. tables_caption.append(bbox)
  98. elif nested_block['type'] == BlockType.TableFootnote:
  99. tables_footnote.append(bbox)
  100. elif block['type'] == BlockType.Image:
  101. imgs.append(bbox)
  102. for nested_block in block['blocks']:
  103. bbox = nested_block['bbox']
  104. if nested_block['type'] == BlockType.ImageBody:
  105. imgs_body.append(bbox)
  106. elif nested_block['type'] == BlockType.ImageCaption:
  107. imgs_caption.append(bbox)
  108. elif nested_block['type'] == BlockType.ImageFootnote:
  109. imgs_footnote.append(bbox)
  110. elif block['type'] == BlockType.Title:
  111. titles.append(bbox)
  112. elif block['type'] == BlockType.Text:
  113. texts.append(bbox)
  114. elif block['type'] == BlockType.InterlineEquation:
  115. interequations.append(bbox)
  116. tables_list.append(tables)
  117. tables_body_list.append(tables_body)
  118. tables_caption_list.append(tables_caption)
  119. tables_footnote_list.append(tables_footnote)
  120. imgs_list.append(imgs)
  121. imgs_body_list.append(imgs_body)
  122. imgs_caption_list.append(imgs_caption)
  123. imgs_footnote_list.append(imgs_footnote)
  124. titles_list.append(titles)
  125. texts_list.append(texts)
  126. interequations_list.append(interequations)
  127. layout_bbox_list = []
  128. for page in pdf_info:
  129. page_block_list = []
  130. for block in page['para_blocks']:
  131. bbox = block['bbox']
  132. page_block_list.append(bbox)
  133. layout_bbox_list.append(page_block_list)
  134. pdf_docs = fitz.open('pdf', pdf_bytes)
  135. for i, page in enumerate(pdf_docs):
  136. # draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  137. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
  138. True)
  139. draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
  140. True) # color !
  141. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
  142. True)
  143. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
  144. True)
  145. draw_bbox_without_number(i, tables_footnote_list, page,
  146. [229, 255, 204], True)
  147. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  148. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  149. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
  150. True)
  151. draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102],
  152. True),
  153. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  154. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  155. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
  156. True)
  157. for i, page in enumerate(pdf_docs):
  158. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, width=0)
  159. # Save the PDF
  160. pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
  161. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  162. text_list = []
  163. inline_equation_list = []
  164. interline_equation_list = []
  165. image_list = []
  166. table_list = []
  167. dropped_list = []
  168. next_page_text_list = []
  169. next_page_inline_equation_list = []
  170. def get_span_info(span):
  171. if span['type'] == ContentType.Text:
  172. if span.get(CROSS_PAGE, False):
  173. next_page_text_list.append(span['bbox'])
  174. else:
  175. page_text_list.append(span['bbox'])
  176. elif span['type'] == ContentType.InlineEquation:
  177. if span.get(CROSS_PAGE, False):
  178. next_page_inline_equation_list.append(span['bbox'])
  179. else:
  180. page_inline_equation_list.append(span['bbox'])
  181. elif span['type'] == ContentType.InterlineEquation:
  182. page_interline_equation_list.append(span['bbox'])
  183. elif span['type'] == ContentType.Image:
  184. page_image_list.append(span['bbox'])
  185. elif span['type'] == ContentType.Table:
  186. page_table_list.append(span['bbox'])
  187. for page in pdf_info:
  188. page_text_list = []
  189. page_inline_equation_list = []
  190. page_interline_equation_list = []
  191. page_image_list = []
  192. page_table_list = []
  193. page_dropped_list = []
  194. # 将跨页的span放到移动到下一页的列表中
  195. if len(next_page_text_list) > 0:
  196. page_text_list.extend(next_page_text_list)
  197. next_page_text_list.clear()
  198. if len(next_page_inline_equation_list) > 0:
  199. page_inline_equation_list.extend(next_page_inline_equation_list)
  200. next_page_inline_equation_list.clear()
  201. # 构造dropped_list
  202. for block in page['discarded_blocks']:
  203. if block['type'] == BlockType.Discarded:
  204. for line in block['lines']:
  205. for span in line['spans']:
  206. page_dropped_list.append(span['bbox'])
  207. dropped_list.append(page_dropped_list)
  208. # 构造其余useful_list
  209. for block in page['para_blocks']:
  210. if block['type'] in [
  211. BlockType.Text,
  212. BlockType.Title,
  213. BlockType.InterlineEquation,
  214. ]:
  215. for line in block['lines']:
  216. for span in line['spans']:
  217. get_span_info(span)
  218. elif block['type'] in [BlockType.Image, BlockType.Table]:
  219. for sub_block in block['blocks']:
  220. for line in sub_block['lines']:
  221. for span in line['spans']:
  222. get_span_info(span)
  223. text_list.append(page_text_list)
  224. inline_equation_list.append(page_inline_equation_list)
  225. interline_equation_list.append(page_interline_equation_list)
  226. image_list.append(page_image_list)
  227. table_list.append(page_table_list)
  228. pdf_docs = fitz.open('pdf', pdf_bytes)
  229. for i, page in enumerate(pdf_docs):
  230. # 获取当前页面的数据
  231. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  232. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  233. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  234. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  235. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  236. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  237. # Save the PDF
  238. pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
  239. def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
  240. dropped_bbox_list = []
  241. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  242. imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
  243. titles_list = []
  244. texts_list = []
  245. interequations_list = []
  246. pdf_docs = fitz.open('pdf', pdf_bytes)
  247. magic_model = MagicModel(model_list, pdf_docs)
  248. for i in range(len(model_list)):
  249. page_dropped_list = []
  250. tables_body, tables_caption, tables_footnote = [], [], []
  251. imgs_body, imgs_caption, imgs_footnote = [], [], []
  252. titles = []
  253. texts = []
  254. interequations = []
  255. page_info = magic_model.get_model_list(i)
  256. layout_dets = page_info['layout_dets']
  257. for layout_det in layout_dets:
  258. bbox = layout_det['bbox']
  259. if layout_det['category_id'] == CategoryId.Text:
  260. texts.append(bbox)
  261. elif layout_det['category_id'] == CategoryId.Title:
  262. titles.append(bbox)
  263. elif layout_det['category_id'] == CategoryId.TableBody:
  264. tables_body.append(bbox)
  265. elif layout_det['category_id'] == CategoryId.TableCaption:
  266. tables_caption.append(bbox)
  267. elif layout_det['category_id'] == CategoryId.TableFootnote:
  268. tables_footnote.append(bbox)
  269. elif layout_det['category_id'] == CategoryId.ImageBody:
  270. imgs_body.append(bbox)
  271. elif layout_det['category_id'] == CategoryId.ImageCaption:
  272. imgs_caption.append(bbox)
  273. elif layout_det[
  274. 'category_id'] == CategoryId.InterlineEquation_YOLO:
  275. interequations.append(bbox)
  276. elif layout_det['category_id'] == CategoryId.Abandon:
  277. page_dropped_list.append(bbox)
  278. elif layout_det['category_id'] == CategoryId.ImageFootnote:
  279. imgs_footnote.append(bbox)
  280. tables_body_list.append(tables_body)
  281. tables_caption_list.append(tables_caption)
  282. tables_footnote_list.append(tables_footnote)
  283. imgs_body_list.append(imgs_body)
  284. imgs_caption_list.append(imgs_caption)
  285. titles_list.append(titles)
  286. texts_list.append(texts)
  287. interequations_list.append(interequations)
  288. dropped_bbox_list.append(page_dropped_list)
  289. imgs_footnote_list.append(imgs_footnote)
  290. for i, page in enumerate(pdf_docs):
  291. draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
  292. True) # color !
  293. draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
  294. draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
  295. True)
  296. draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
  297. True)
  298. draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
  299. draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
  300. True)
  301. draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
  302. True)
  303. draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
  304. draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
  305. draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
  306. # Save the PDF
  307. pdf_docs.save(f'{out_path}/{filename}_model.pdf')
  308. def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  309. layout_bbox_list = []
  310. from loguru import logger
  311. for page in pdf_info:
  312. page_line_list = []
  313. for block in page['preproc_blocks']:
  314. if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
  315. for line in block['lines']:
  316. bbox = line['bbox']
  317. index = line['index']
  318. page_line_list.append({'index': index, 'bbox': bbox})
  319. if block['type'] == 'table' or block['type'] == 'image':
  320. bbox = block['bbox']
  321. index = block['index']
  322. page_line_list.append({'index': index, 'bbox': bbox})
  323. sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
  324. layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
  325. pdf_docs = fitz.open('pdf', pdf_bytes)
  326. for i, page in enumerate(pdf_docs):
  327. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  328. pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
  329. def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  330. layout_bbox_list = []
  331. for page in pdf_info:
  332. page_block_list = []
  333. for block in page['para_blocks']:
  334. bbox = block['bbox']
  335. page_block_list.append(bbox)
  336. layout_bbox_list.append(page_block_list)
  337. pdf_docs = fitz.open('pdf', pdf_bytes)
  338. for i, page in enumerate(pdf_docs):
  339. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  340. pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')