draw_bbox.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. from magic_pdf.libs.commons import fitz # PyMuPDF
  2. from magic_pdf.libs.Constants import CROSS_PAGE
  3. from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
  4. from magic_pdf.model.magic_model import MagicModel
  5. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  6. new_rgb = []
  7. for item in rgb_config:
  8. item = float(item) / 255
  9. new_rgb.append(item)
  10. page_data = bbox_list[i]
  11. for bbox in page_data:
  12. x0, y0, x1, y1 = bbox
  13. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  14. if fill_config:
  15. page.draw_rect(
  16. rect_coords,
  17. color=None,
  18. fill=new_rgb,
  19. fill_opacity=0.3,
  20. width=0.5,
  21. overlay=True,
  22. ) # Draw the rectangle
  23. else:
  24. page.draw_rect(
  25. rect_coords,
  26. color=new_rgb,
  27. fill=None,
  28. fill_opacity=1,
  29. width=0.5,
  30. overlay=True,
  31. ) # Draw the rectangle
  32. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
  33. new_rgb = []
  34. for item in rgb_config:
  35. item = float(item) / 255
  36. new_rgb.append(item)
  37. page_data = bbox_list[i]
  38. for j, bbox in enumerate(page_data):
  39. x0, y0, x1, y1 = bbox
  40. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  41. if draw_bbox:
  42. if fill_config:
  43. page.draw_rect(
  44. rect_coords,
  45. color=None,
  46. fill=new_rgb,
  47. fill_opacity=0.3,
  48. width=0.5,
  49. overlay=True,
  50. ) # Draw the rectangle
  51. else:
  52. page.draw_rect(
  53. rect_coords,
  54. color=new_rgb,
  55. fill=None,
  56. fill_opacity=1,
  57. width=0.5,
  58. overlay=True,
  59. ) # Draw the rectangle
  60. page.insert_text(
  61. (x1+2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  62. ) # Insert the index in the top left corner of the rectangle
  63. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  64. dropped_bbox_list = []
  65. tables_list, tables_body_list = [], []
  66. tables_caption_list, tables_footnote_list = [], []
  67. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  68. imgs_footnote_list = []
  69. titles_list = []
  70. texts_list = []
  71. interequations_list = []
  72. lists_list = []
  73. indexs_list = []
  74. for page in pdf_info:
  75. page_dropped_list = []
  76. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  77. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  78. titles = []
  79. texts = []
  80. interequations = []
  81. lists = []
  82. indexs = []
  83. for dropped_bbox in page['discarded_blocks']:
  84. page_dropped_list.append(dropped_bbox['bbox'])
  85. dropped_bbox_list.append(page_dropped_list)
  86. for block in page['para_blocks']:
  87. bbox = block['bbox']
  88. if block['type'] == BlockType.Table:
  89. tables.append(bbox)
  90. for nested_block in block['blocks']:
  91. bbox = nested_block['bbox']
  92. if nested_block['type'] == BlockType.TableBody:
  93. tables_body.append(bbox)
  94. elif nested_block['type'] == BlockType.TableCaption:
  95. tables_caption.append(bbox)
  96. elif nested_block['type'] == BlockType.TableFootnote:
  97. tables_footnote.append(bbox)
  98. elif block['type'] == BlockType.Image:
  99. imgs.append(bbox)
  100. for nested_block in block['blocks']:
  101. bbox = nested_block['bbox']
  102. if nested_block['type'] == BlockType.ImageBody:
  103. imgs_body.append(bbox)
  104. elif nested_block['type'] == BlockType.ImageCaption:
  105. imgs_caption.append(bbox)
  106. elif nested_block['type'] == BlockType.ImageFootnote:
  107. imgs_footnote.append(bbox)
  108. elif block['type'] == BlockType.Title:
  109. titles.append(bbox)
  110. elif block['type'] == BlockType.Text:
  111. texts.append(bbox)
  112. elif block['type'] == BlockType.InterlineEquation:
  113. interequations.append(bbox)
  114. elif block['type'] == BlockType.List:
  115. lists.append(bbox)
  116. elif block['type'] == BlockType.Index:
  117. indexs.append(bbox)
  118. tables_list.append(tables)
  119. tables_body_list.append(tables_body)
  120. tables_caption_list.append(tables_caption)
  121. tables_footnote_list.append(tables_footnote)
  122. imgs_list.append(imgs)
  123. imgs_body_list.append(imgs_body)
  124. imgs_caption_list.append(imgs_caption)
  125. imgs_footnote_list.append(imgs_footnote)
  126. titles_list.append(titles)
  127. texts_list.append(texts)
  128. interequations_list.append(interequations)
  129. lists_list.append(lists)
  130. indexs_list.append(indexs)
  131. layout_bbox_list = []
  132. for page in pdf_info:
  133. page_block_list = []
  134. for block in page['para_blocks']:
  135. bbox = block['bbox']
  136. page_block_list.append(bbox)
  137. layout_bbox_list.append(page_block_list)
  138. pdf_docs = fitz.open('pdf', pdf_bytes)
  139. for i, page in enumerate(pdf_docs):
  140. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
  141. True)
  142. draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
  143. True) # color !
  144. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
  145. True)
  146. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
  147. True)
  148. draw_bbox_without_number(i, tables_footnote_list, page,
  149. [229, 255, 204], True)
  150. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  151. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  152. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
  153. True)
  154. draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102],
  155. True),
  156. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  157. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  158. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
  159. True)
  160. draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
  161. draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
  162. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False)
  163. # Save the PDF
  164. pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
  165. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  166. text_list = []
  167. inline_equation_list = []
  168. interline_equation_list = []
  169. image_list = []
  170. table_list = []
  171. dropped_list = []
  172. next_page_text_list = []
  173. next_page_inline_equation_list = []
  174. def get_span_info(span):
  175. if span['type'] == ContentType.Text:
  176. if span.get(CROSS_PAGE, False):
  177. next_page_text_list.append(span['bbox'])
  178. else:
  179. page_text_list.append(span['bbox'])
  180. elif span['type'] == ContentType.InlineEquation:
  181. if span.get(CROSS_PAGE, False):
  182. next_page_inline_equation_list.append(span['bbox'])
  183. else:
  184. page_inline_equation_list.append(span['bbox'])
  185. elif span['type'] == ContentType.InterlineEquation:
  186. page_interline_equation_list.append(span['bbox'])
  187. elif span['type'] == ContentType.Image:
  188. page_image_list.append(span['bbox'])
  189. elif span['type'] == ContentType.Table:
  190. page_table_list.append(span['bbox'])
  191. for page in pdf_info:
  192. page_text_list = []
  193. page_inline_equation_list = []
  194. page_interline_equation_list = []
  195. page_image_list = []
  196. page_table_list = []
  197. page_dropped_list = []
  198. # 将跨页的span放到移动到下一页的列表中
  199. if len(next_page_text_list) > 0:
  200. page_text_list.extend(next_page_text_list)
  201. next_page_text_list.clear()
  202. if len(next_page_inline_equation_list) > 0:
  203. page_inline_equation_list.extend(next_page_inline_equation_list)
  204. next_page_inline_equation_list.clear()
  205. # 构造dropped_list
  206. for block in page['discarded_blocks']:
  207. if block['type'] == BlockType.Discarded:
  208. for line in block['lines']:
  209. for span in line['spans']:
  210. page_dropped_list.append(span['bbox'])
  211. dropped_list.append(page_dropped_list)
  212. # 构造其余useful_list
  213. for block in page['para_blocks']:
  214. if block['type'] in [
  215. BlockType.Text,
  216. BlockType.Title,
  217. BlockType.InterlineEquation,
  218. BlockType.List,
  219. BlockType.Index,
  220. ]:
  221. for line in block['lines']:
  222. for span in line['spans']:
  223. get_span_info(span)
  224. elif block['type'] in [BlockType.Image, BlockType.Table]:
  225. for sub_block in block['blocks']:
  226. for line in sub_block['lines']:
  227. for span in line['spans']:
  228. get_span_info(span)
  229. text_list.append(page_text_list)
  230. inline_equation_list.append(page_inline_equation_list)
  231. interline_equation_list.append(page_interline_equation_list)
  232. image_list.append(page_image_list)
  233. table_list.append(page_table_list)
  234. pdf_docs = fitz.open('pdf', pdf_bytes)
  235. for i, page in enumerate(pdf_docs):
  236. # 获取当前页面的数据
  237. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  238. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  239. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  240. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  241. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  242. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  243. # Save the PDF
  244. pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
  245. def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
  246. dropped_bbox_list = []
  247. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  248. imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
  249. titles_list = []
  250. texts_list = []
  251. interequations_list = []
  252. pdf_docs = fitz.open('pdf', pdf_bytes)
  253. magic_model = MagicModel(model_list, pdf_docs)
  254. for i in range(len(model_list)):
  255. page_dropped_list = []
  256. tables_body, tables_caption, tables_footnote = [], [], []
  257. imgs_body, imgs_caption, imgs_footnote = [], [], []
  258. titles = []
  259. texts = []
  260. interequations = []
  261. page_info = magic_model.get_model_list(i)
  262. layout_dets = page_info['layout_dets']
  263. for layout_det in layout_dets:
  264. bbox = layout_det['bbox']
  265. if layout_det['category_id'] == CategoryId.Text:
  266. texts.append(bbox)
  267. elif layout_det['category_id'] == CategoryId.Title:
  268. titles.append(bbox)
  269. elif layout_det['category_id'] == CategoryId.TableBody:
  270. tables_body.append(bbox)
  271. elif layout_det['category_id'] == CategoryId.TableCaption:
  272. tables_caption.append(bbox)
  273. elif layout_det['category_id'] == CategoryId.TableFootnote:
  274. tables_footnote.append(bbox)
  275. elif layout_det['category_id'] == CategoryId.ImageBody:
  276. imgs_body.append(bbox)
  277. elif layout_det['category_id'] == CategoryId.ImageCaption:
  278. imgs_caption.append(bbox)
  279. elif layout_det[
  280. 'category_id'] == CategoryId.InterlineEquation_YOLO:
  281. interequations.append(bbox)
  282. elif layout_det['category_id'] == CategoryId.Abandon:
  283. page_dropped_list.append(bbox)
  284. elif layout_det['category_id'] == CategoryId.ImageFootnote:
  285. imgs_footnote.append(bbox)
  286. tables_body_list.append(tables_body)
  287. tables_caption_list.append(tables_caption)
  288. tables_footnote_list.append(tables_footnote)
  289. imgs_body_list.append(imgs_body)
  290. imgs_caption_list.append(imgs_caption)
  291. titles_list.append(titles)
  292. texts_list.append(texts)
  293. interequations_list.append(interequations)
  294. dropped_bbox_list.append(page_dropped_list)
  295. imgs_footnote_list.append(imgs_footnote)
  296. for i, page in enumerate(pdf_docs):
  297. draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
  298. True) # color !
  299. draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
  300. draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
  301. True)
  302. draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
  303. True)
  304. draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
  305. draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
  306. True)
  307. draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
  308. True)
  309. draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
  310. draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
  311. draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
  312. # Save the PDF
  313. pdf_docs.save(f'{out_path}/{filename}_model.pdf')
  314. def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  315. layout_bbox_list = []
  316. for page in pdf_info:
  317. page_line_list = []
  318. for block in page['preproc_blocks']:
  319. if block['type'] in ['text', 'title', 'interline_equation']:
  320. for line in block['lines']:
  321. bbox = line['bbox']
  322. index = line['index']
  323. page_line_list.append({'index': index, 'bbox': bbox})
  324. if block['type'] in ['table', 'image']:
  325. bbox = block['bbox']
  326. index = block['index']
  327. page_line_list.append({'index': index, 'bbox': bbox})
  328. # for line in block['lines']:
  329. # bbox = line['bbox']
  330. # index = line['index']
  331. # page_line_list.append({'index': index, 'bbox': bbox})
  332. sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
  333. layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
  334. pdf_docs = fitz.open('pdf', pdf_bytes)
  335. for i, page in enumerate(pdf_docs):
  336. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  337. pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
  338. def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  339. layout_bbox_list = []
  340. for page in pdf_info:
  341. page_block_list = []
  342. for block in page['para_blocks']:
  343. bbox = block['bbox']
  344. page_block_list.append(bbox)
  345. layout_bbox_list.append(page_block_list)
  346. pdf_docs = fitz.open('pdf', pdf_bytes)
  347. for i, page in enumerate(pdf_docs):
  348. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  349. pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')