draw_bbox.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. from magic_pdf.config.constants import CROSS_PAGE
  2. from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
  3. ContentType)
  4. from magic_pdf.data.dataset import PymuDocDataset
  5. from magic_pdf.libs.commons import fitz # PyMuPDF
  6. from magic_pdf.model.magic_model import MagicModel
  7. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  8. new_rgb = []
  9. for item in rgb_config:
  10. item = float(item) / 255
  11. new_rgb.append(item)
  12. page_data = bbox_list[i]
  13. for bbox in page_data:
  14. x0, y0, x1, y1 = bbox
  15. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  16. if fill_config:
  17. page.draw_rect(
  18. rect_coords,
  19. color=None,
  20. fill=new_rgb,
  21. fill_opacity=0.3,
  22. width=0.5,
  23. overlay=True,
  24. ) # Draw the rectangle
  25. else:
  26. page.draw_rect(
  27. rect_coords,
  28. color=new_rgb,
  29. fill=None,
  30. fill_opacity=1,
  31. width=0.5,
  32. overlay=True,
  33. ) # Draw the rectangle
  34. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
  35. new_rgb = []
  36. for item in rgb_config:
  37. item = float(item) / 255
  38. new_rgb.append(item)
  39. page_data = bbox_list[i]
  40. for j, bbox in enumerate(page_data):
  41. x0, y0, x1, y1 = bbox
  42. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  43. if draw_bbox:
  44. if fill_config:
  45. page.draw_rect(
  46. rect_coords,
  47. color=None,
  48. fill=new_rgb,
  49. fill_opacity=0.3,
  50. width=0.5,
  51. overlay=True,
  52. ) # Draw the rectangle
  53. else:
  54. page.draw_rect(
  55. rect_coords,
  56. color=new_rgb,
  57. fill=None,
  58. fill_opacity=1,
  59. width=0.5,
  60. overlay=True,
  61. ) # Draw the rectangle
  62. page.insert_text(
  63. (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  64. ) # Insert the index in the top left corner of the rectangle
  65. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  66. dropped_bbox_list = []
  67. tables_list, tables_body_list = [], []
  68. tables_caption_list, tables_footnote_list = [], []
  69. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  70. imgs_footnote_list = []
  71. titles_list = []
  72. texts_list = []
  73. interequations_list = []
  74. lists_list = []
  75. indexs_list = []
  76. for page in pdf_info:
  77. page_dropped_list = []
  78. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  79. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  80. titles = []
  81. texts = []
  82. interequations = []
  83. lists = []
  84. indices = []
  85. for dropped_bbox in page['discarded_blocks']:
  86. page_dropped_list.append(dropped_bbox['bbox'])
  87. dropped_bbox_list.append(page_dropped_list)
  88. for block in page['para_blocks']:
  89. bbox = block['bbox']
  90. if block['type'] == BlockType.Table:
  91. tables.append(bbox)
  92. for nested_block in block['blocks']:
  93. bbox = nested_block['bbox']
  94. if nested_block['type'] == BlockType.TableBody:
  95. tables_body.append(bbox)
  96. elif nested_block['type'] == BlockType.TableCaption:
  97. tables_caption.append(bbox)
  98. elif nested_block['type'] == BlockType.TableFootnote:
  99. tables_footnote.append(bbox)
  100. elif block['type'] == BlockType.Image:
  101. imgs.append(bbox)
  102. for nested_block in block['blocks']:
  103. bbox = nested_block['bbox']
  104. if nested_block['type'] == BlockType.ImageBody:
  105. imgs_body.append(bbox)
  106. elif nested_block['type'] == BlockType.ImageCaption:
  107. imgs_caption.append(bbox)
  108. elif nested_block['type'] == BlockType.ImageFootnote:
  109. imgs_footnote.append(bbox)
  110. elif block['type'] == BlockType.Title:
  111. titles.append(bbox)
  112. elif block['type'] == BlockType.Text:
  113. texts.append(bbox)
  114. elif block['type'] == BlockType.InterlineEquation:
  115. interequations.append(bbox)
  116. elif block['type'] == BlockType.List:
  117. lists.append(bbox)
  118. elif block['type'] == BlockType.Index:
  119. indices.append(bbox)
  120. tables_list.append(tables)
  121. tables_body_list.append(tables_body)
  122. tables_caption_list.append(tables_caption)
  123. tables_footnote_list.append(tables_footnote)
  124. imgs_list.append(imgs)
  125. imgs_body_list.append(imgs_body)
  126. imgs_caption_list.append(imgs_caption)
  127. imgs_footnote_list.append(imgs_footnote)
  128. titles_list.append(titles)
  129. texts_list.append(texts)
  130. interequations_list.append(interequations)
  131. lists_list.append(lists)
  132. indexs_list.append(indices)
  133. layout_bbox_list = []
  134. table_type_order = {
  135. 'table_caption': 1,
  136. 'table_body': 2,
  137. 'table_footnote': 3
  138. }
  139. for page in pdf_info:
  140. page_block_list = []
  141. for block in page['para_blocks']:
  142. if block['type'] in [
  143. BlockType.Text,
  144. BlockType.Title,
  145. BlockType.InterlineEquation,
  146. BlockType.List,
  147. BlockType.Index,
  148. ]:
  149. bbox = block['bbox']
  150. page_block_list.append(bbox)
  151. elif block['type'] in [BlockType.Image]:
  152. for sub_block in block['blocks']:
  153. bbox = sub_block['bbox']
  154. page_block_list.append(bbox)
  155. elif block['type'] in [BlockType.Table]:
  156. sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
  157. for sub_block in sorted_blocks:
  158. bbox = sub_block['bbox']
  159. page_block_list.append(bbox)
  160. layout_bbox_list.append(page_block_list)
  161. pdf_docs = fitz.open('pdf', pdf_bytes)
  162. for i, page in enumerate(pdf_docs):
  163. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
  164. # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
  165. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
  166. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
  167. draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
  168. # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  169. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  170. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
  171. draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
  172. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  173. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  174. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
  175. draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
  176. draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
  177. draw_bbox_with_number(
  178. i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
  179. )
  180. # Save the PDF
  181. pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
  182. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  183. text_list = []
  184. inline_equation_list = []
  185. interline_equation_list = []
  186. image_list = []
  187. table_list = []
  188. dropped_list = []
  189. next_page_text_list = []
  190. next_page_inline_equation_list = []
  191. def get_span_info(span):
  192. if span['type'] == ContentType.Text:
  193. if span.get(CROSS_PAGE, False):
  194. next_page_text_list.append(span['bbox'])
  195. else:
  196. page_text_list.append(span['bbox'])
  197. elif span['type'] == ContentType.InlineEquation:
  198. if span.get(CROSS_PAGE, False):
  199. next_page_inline_equation_list.append(span['bbox'])
  200. else:
  201. page_inline_equation_list.append(span['bbox'])
  202. elif span['type'] == ContentType.InterlineEquation:
  203. page_interline_equation_list.append(span['bbox'])
  204. elif span['type'] == ContentType.Image:
  205. page_image_list.append(span['bbox'])
  206. elif span['type'] == ContentType.Table:
  207. page_table_list.append(span['bbox'])
  208. for page in pdf_info:
  209. page_text_list = []
  210. page_inline_equation_list = []
  211. page_interline_equation_list = []
  212. page_image_list = []
  213. page_table_list = []
  214. page_dropped_list = []
  215. # 将跨页的span放到移动到下一页的列表中
  216. if len(next_page_text_list) > 0:
  217. page_text_list.extend(next_page_text_list)
  218. next_page_text_list.clear()
  219. if len(next_page_inline_equation_list) > 0:
  220. page_inline_equation_list.extend(next_page_inline_equation_list)
  221. next_page_inline_equation_list.clear()
  222. # 构造dropped_list
  223. for block in page['discarded_blocks']:
  224. if block['type'] == BlockType.Discarded:
  225. for line in block['lines']:
  226. for span in line['spans']:
  227. page_dropped_list.append(span['bbox'])
  228. dropped_list.append(page_dropped_list)
  229. # 构造其余useful_list
  230. # for block in page['para_blocks']: # span直接用分段合并前的结果就可以
  231. for block in page['preproc_blocks']:
  232. if block['type'] in [
  233. BlockType.Text,
  234. BlockType.Title,
  235. BlockType.InterlineEquation,
  236. BlockType.List,
  237. BlockType.Index,
  238. ]:
  239. for line in block['lines']:
  240. for span in line['spans']:
  241. get_span_info(span)
  242. elif block['type'] in [BlockType.Image, BlockType.Table]:
  243. for sub_block in block['blocks']:
  244. for line in sub_block['lines']:
  245. for span in line['spans']:
  246. get_span_info(span)
  247. text_list.append(page_text_list)
  248. inline_equation_list.append(page_inline_equation_list)
  249. interline_equation_list.append(page_interline_equation_list)
  250. image_list.append(page_image_list)
  251. table_list.append(page_table_list)
  252. pdf_docs = fitz.open('pdf', pdf_bytes)
  253. for i, page in enumerate(pdf_docs):
  254. # 获取当前页面的数据
  255. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  256. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
  257. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
  258. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  259. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  260. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  261. # Save the PDF
  262. pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
  263. def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
  264. dropped_bbox_list = []
  265. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  266. imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
  267. titles_list = []
  268. texts_list = []
  269. interequations_list = []
  270. pdf_docs = fitz.open('pdf', pdf_bytes)
  271. magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
  272. for i in range(len(model_list)):
  273. page_dropped_list = []
  274. tables_body, tables_caption, tables_footnote = [], [], []
  275. imgs_body, imgs_caption, imgs_footnote = [], [], []
  276. titles = []
  277. texts = []
  278. interequations = []
  279. page_info = magic_model.get_model_list(i)
  280. layout_dets = page_info['layout_dets']
  281. for layout_det in layout_dets:
  282. bbox = layout_det['bbox']
  283. if layout_det['category_id'] == CategoryId.Text:
  284. texts.append(bbox)
  285. elif layout_det['category_id'] == CategoryId.Title:
  286. titles.append(bbox)
  287. elif layout_det['category_id'] == CategoryId.TableBody:
  288. tables_body.append(bbox)
  289. elif layout_det['category_id'] == CategoryId.TableCaption:
  290. tables_caption.append(bbox)
  291. elif layout_det['category_id'] == CategoryId.TableFootnote:
  292. tables_footnote.append(bbox)
  293. elif layout_det['category_id'] == CategoryId.ImageBody:
  294. imgs_body.append(bbox)
  295. elif layout_det['category_id'] == CategoryId.ImageCaption:
  296. imgs_caption.append(bbox)
  297. elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
  298. interequations.append(bbox)
  299. elif layout_det['category_id'] == CategoryId.Abandon:
  300. page_dropped_list.append(bbox)
  301. elif layout_det['category_id'] == CategoryId.ImageFootnote:
  302. imgs_footnote.append(bbox)
  303. tables_body_list.append(tables_body)
  304. tables_caption_list.append(tables_caption)
  305. tables_footnote_list.append(tables_footnote)
  306. imgs_body_list.append(imgs_body)
  307. imgs_caption_list.append(imgs_caption)
  308. titles_list.append(titles)
  309. texts_list.append(texts)
  310. interequations_list.append(interequations)
  311. dropped_bbox_list.append(page_dropped_list)
  312. imgs_footnote_list.append(imgs_footnote)
  313. for i, page in enumerate(pdf_docs):
  314. draw_bbox_with_number(
  315. i, dropped_bbox_list, page, [158, 158, 158], True
  316. ) # color !
  317. draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
  318. draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
  319. draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
  320. draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
  321. draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
  322. draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
  323. draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
  324. draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
  325. draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
  326. # Save the PDF
  327. pdf_docs.save(f'{out_path}/{filename}_model.pdf')
  328. def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  329. layout_bbox_list = []
  330. for page in pdf_info:
  331. page_line_list = []
  332. for block in page['preproc_blocks']:
  333. if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
  334. for line in block['lines']:
  335. bbox = line['bbox']
  336. index = line['index']
  337. page_line_list.append({'index': index, 'bbox': bbox})
  338. if block['type'] in [BlockType.Image, BlockType.Table]:
  339. for sub_block in block['blocks']:
  340. if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
  341. if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
  342. for line in sub_block['virtual_lines']:
  343. bbox = line['bbox']
  344. index = line['index']
  345. page_line_list.append({'index': index, 'bbox': bbox})
  346. else:
  347. for line in sub_block['lines']:
  348. bbox = line['bbox']
  349. index = line['index']
  350. page_line_list.append({'index': index, 'bbox': bbox})
  351. elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
  352. for line in sub_block['lines']:
  353. bbox = line['bbox']
  354. index = line['index']
  355. page_line_list.append({'index': index, 'bbox': bbox})
  356. sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
  357. layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
  358. pdf_docs = fitz.open('pdf', pdf_bytes)
  359. for i, page in enumerate(pdf_docs):
  360. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  361. pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
  362. def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  363. layout_bbox_list = []
  364. for page in pdf_info:
  365. page_block_list = []
  366. for block in page['para_blocks']:
  367. bbox = block['bbox']
  368. page_block_list.append(bbox)
  369. layout_bbox_list.append(page_block_list)
  370. pdf_docs = fitz.open('pdf', pdf_bytes)
  371. for i, page in enumerate(pdf_docs):
  372. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  373. pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')