draw_bbox.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. import time
  2. from magic_pdf.libs.commons import fitz # PyMuPDF
  3. from magic_pdf.libs.Constants import CROSS_PAGE
  4. from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
  5. from magic_pdf.model.magic_model import MagicModel
  6. def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
  7. new_rgb = []
  8. for item in rgb_config:
  9. item = float(item) / 255
  10. new_rgb.append(item)
  11. page_data = bbox_list[i]
  12. for bbox in page_data:
  13. x0, y0, x1, y1 = bbox
  14. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  15. if fill_config:
  16. page.draw_rect(
  17. rect_coords,
  18. color=None,
  19. fill=new_rgb,
  20. fill_opacity=0.3,
  21. width=0.5,
  22. overlay=True,
  23. ) # Draw the rectangle
  24. else:
  25. page.draw_rect(
  26. rect_coords,
  27. color=new_rgb,
  28. fill=None,
  29. fill_opacity=1,
  30. width=0.5,
  31. overlay=True,
  32. ) # Draw the rectangle
  33. def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
  34. new_rgb = []
  35. for item in rgb_config:
  36. item = float(item) / 255
  37. new_rgb.append(item)
  38. page_data = bbox_list[i]
  39. for j, bbox in enumerate(page_data):
  40. x0, y0, x1, y1 = bbox
  41. rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
  42. if fill_config:
  43. page.draw_rect(
  44. rect_coords,
  45. color=None,
  46. fill=new_rgb,
  47. fill_opacity=0.3,
  48. width=0.5,
  49. overlay=True,
  50. ) # Draw the rectangle
  51. else:
  52. page.draw_rect(
  53. rect_coords,
  54. color=new_rgb,
  55. fill=None,
  56. fill_opacity=1,
  57. width=0.5,
  58. overlay=True,
  59. ) # Draw the rectangle
  60. page.insert_text(
  61. (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
  62. ) # Insert the index in the top left corner of the rectangle
  63. def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
  64. layout_bbox_list = []
  65. dropped_bbox_list = []
  66. tables_list, tables_body_list = [], []
  67. tables_caption_list, tables_footnote_list = [], []
  68. imgs_list, imgs_body_list, imgs_caption_list = [], [], []
  69. imgs_footnote_list = []
  70. titles_list = []
  71. texts_list = []
  72. interequations_list = []
  73. for page in pdf_info:
  74. page_layout_list = []
  75. page_dropped_list = []
  76. tables, tables_body, tables_caption, tables_footnote = [], [], [], []
  77. imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
  78. titles = []
  79. texts = []
  80. interequations = []
  81. for layout in page['layout_bboxes']:
  82. page_layout_list.append(layout['layout_bbox'])
  83. layout_bbox_list.append(page_layout_list)
  84. for dropped_bbox in page['discarded_blocks']:
  85. page_dropped_list.append(dropped_bbox['bbox'])
  86. dropped_bbox_list.append(page_dropped_list)
  87. for block in page['para_blocks']:
  88. bbox = block['bbox']
  89. if block['type'] == BlockType.Table:
  90. tables.append(bbox)
  91. for nested_block in block['blocks']:
  92. bbox = nested_block['bbox']
  93. if nested_block['type'] == BlockType.TableBody:
  94. tables_body.append(bbox)
  95. elif nested_block['type'] == BlockType.TableCaption:
  96. tables_caption.append(bbox)
  97. elif nested_block['type'] == BlockType.TableFootnote:
  98. tables_footnote.append(bbox)
  99. elif block['type'] == BlockType.Image:
  100. imgs.append(bbox)
  101. for nested_block in block['blocks']:
  102. bbox = nested_block['bbox']
  103. if nested_block['type'] == BlockType.ImageBody:
  104. imgs_body.append(bbox)
  105. elif nested_block['type'] == BlockType.ImageCaption:
  106. imgs_caption.append(bbox)
  107. elif nested_block['type'] == BlockType.ImageFootnote:
  108. imgs_footnote.append(bbox)
  109. elif block['type'] == BlockType.Title:
  110. titles.append(bbox)
  111. elif block['type'] == BlockType.Text:
  112. texts.append(bbox)
  113. elif block['type'] == BlockType.InterlineEquation:
  114. interequations.append(bbox)
  115. tables_list.append(tables)
  116. tables_body_list.append(tables_body)
  117. tables_caption_list.append(tables_caption)
  118. tables_footnote_list.append(tables_footnote)
  119. imgs_list.append(imgs)
  120. imgs_body_list.append(imgs_body)
  121. imgs_caption_list.append(imgs_caption)
  122. imgs_footnote_list.append(imgs_footnote)
  123. titles_list.append(titles)
  124. texts_list.append(texts)
  125. interequations_list.append(interequations)
  126. pdf_docs = fitz.open('pdf', pdf_bytes)
  127. for i, page in enumerate(pdf_docs):
  128. draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
  129. draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
  130. True)
  131. draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
  132. True) # color !
  133. draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
  134. True)
  135. draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
  136. True)
  137. draw_bbox_without_number(i, tables_footnote_list, page,
  138. [229, 255, 204], True)
  139. draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
  140. draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
  141. draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
  142. True)
  143. draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
  144. True),
  145. draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
  146. draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
  147. draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
  148. True)
  149. # Save the PDF
  150. pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
  151. def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
  152. text_list = []
  153. inline_equation_list = []
  154. interline_equation_list = []
  155. image_list = []
  156. table_list = []
  157. dropped_list = []
  158. next_page_text_list = []
  159. next_page_inline_equation_list = []
  160. def get_span_info(span):
  161. if span['type'] == ContentType.Text:
  162. if span.get(CROSS_PAGE, False):
  163. next_page_text_list.append(span['bbox'])
  164. else:
  165. page_text_list.append(span['bbox'])
  166. elif span['type'] == ContentType.InlineEquation:
  167. if span.get(CROSS_PAGE, False):
  168. next_page_inline_equation_list.append(span['bbox'])
  169. else:
  170. page_inline_equation_list.append(span['bbox'])
  171. elif span['type'] == ContentType.InterlineEquation:
  172. page_interline_equation_list.append(span['bbox'])
  173. elif span['type'] == ContentType.Image:
  174. page_image_list.append(span['bbox'])
  175. elif span['type'] == ContentType.Table:
  176. page_table_list.append(span['bbox'])
  177. for page in pdf_info:
  178. page_text_list = []
  179. page_inline_equation_list = []
  180. page_interline_equation_list = []
  181. page_image_list = []
  182. page_table_list = []
  183. page_dropped_list = []
  184. # 将跨页的span放到移动到下一页的列表中
  185. if len(next_page_text_list) > 0:
  186. page_text_list.extend(next_page_text_list)
  187. next_page_text_list.clear()
  188. if len(next_page_inline_equation_list) > 0:
  189. page_inline_equation_list.extend(next_page_inline_equation_list)
  190. next_page_inline_equation_list.clear()
  191. # 构造dropped_list
  192. for block in page['discarded_blocks']:
  193. if block['type'] == BlockType.Discarded:
  194. for line in block['lines']:
  195. for span in line['spans']:
  196. page_dropped_list.append(span['bbox'])
  197. dropped_list.append(page_dropped_list)
  198. # 构造其余useful_list
  199. for block in page['para_blocks']:
  200. if block['type'] in [
  201. BlockType.Text,
  202. BlockType.Title,
  203. BlockType.InterlineEquation,
  204. ]:
  205. for line in block['lines']:
  206. for span in line['spans']:
  207. get_span_info(span)
  208. elif block['type'] in [BlockType.Image, BlockType.Table]:
  209. for sub_block in block['blocks']:
  210. for line in sub_block['lines']:
  211. for span in line['spans']:
  212. get_span_info(span)
  213. text_list.append(page_text_list)
  214. inline_equation_list.append(page_inline_equation_list)
  215. interline_equation_list.append(page_interline_equation_list)
  216. image_list.append(page_image_list)
  217. table_list.append(page_table_list)
  218. pdf_docs = fitz.open('pdf', pdf_bytes)
  219. for i, page in enumerate(pdf_docs):
  220. # 获取当前页面的数据
  221. draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
  222. draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
  223. False)
  224. draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
  225. False)
  226. draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
  227. draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
  228. draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
  229. # Save the PDF
  230. pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
  231. def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
  232. dropped_bbox_list = []
  233. tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
  234. imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
  235. titles_list = []
  236. texts_list = []
  237. interequations_list = []
  238. pdf_docs = fitz.open('pdf', pdf_bytes)
  239. magic_model = MagicModel(model_list, pdf_docs)
  240. for i in range(len(model_list)):
  241. page_dropped_list = []
  242. tables_body, tables_caption, tables_footnote = [], [], []
  243. imgs_body, imgs_caption, imgs_footnote = [], [], []
  244. titles = []
  245. texts = []
  246. interequations = []
  247. page_info = magic_model.get_model_list(i)
  248. layout_dets = page_info['layout_dets']
  249. for layout_det in layout_dets:
  250. bbox = layout_det['bbox']
  251. if layout_det['category_id'] == CategoryId.Text:
  252. texts.append(bbox)
  253. elif layout_det['category_id'] == CategoryId.Title:
  254. titles.append(bbox)
  255. elif layout_det['category_id'] == CategoryId.TableBody:
  256. tables_body.append(bbox)
  257. elif layout_det['category_id'] == CategoryId.TableCaption:
  258. tables_caption.append(bbox)
  259. elif layout_det['category_id'] == CategoryId.TableFootnote:
  260. tables_footnote.append(bbox)
  261. elif layout_det['category_id'] == CategoryId.ImageBody:
  262. imgs_body.append(bbox)
  263. elif layout_det['category_id'] == CategoryId.ImageCaption:
  264. imgs_caption.append(bbox)
  265. elif layout_det[
  266. 'category_id'] == CategoryId.InterlineEquation_YOLO:
  267. interequations.append(bbox)
  268. elif layout_det['category_id'] == CategoryId.Abandon:
  269. page_dropped_list.append(bbox)
  270. elif layout_det['category_id'] == CategoryId.ImageFootnote:
  271. imgs_footnote.append(bbox)
  272. tables_body_list.append(tables_body)
  273. tables_caption_list.append(tables_caption)
  274. tables_footnote_list.append(tables_footnote)
  275. imgs_body_list.append(imgs_body)
  276. imgs_caption_list.append(imgs_caption)
  277. titles_list.append(titles)
  278. texts_list.append(texts)
  279. interequations_list.append(interequations)
  280. dropped_bbox_list.append(page_dropped_list)
  281. imgs_footnote_list.append(imgs_footnote)
  282. for i, page in enumerate(pdf_docs):
  283. draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
  284. True) # color !
  285. draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
  286. draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
  287. True)
  288. draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
  289. True)
  290. draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
  291. draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
  292. True)
  293. draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
  294. True)
  295. draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
  296. draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
  297. draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
  298. # Save the PDF
  299. pdf_docs.save(f'{out_path}/{filename}_model.pdf')
  300. from typing import List
  301. def do_predict(boxes: List[List[int]]) -> List[int]:
  302. from transformers import LayoutLMv3ForTokenClassification
  303. from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
  304. model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
  305. inputs = boxes2inputs(boxes)
  306. inputs = prepare_inputs(inputs, model)
  307. logits = model(**inputs).logits.cpu().squeeze(0)
  308. return parse_logits(logits, len(boxes))
  309. def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
  310. layout_bbox_list = []
  311. from loguru import logger
  312. for page in pdf_info:
  313. page_layout_list = []
  314. for block in page['para_blocks']:
  315. bbox = block['bbox']
  316. page_layout_list.append(bbox)
  317. # 使用layoutreader排序
  318. page_size = page['page_size']
  319. x_scale = 1000.0 / page_size[0]
  320. y_scale = 1000.0 / page_size[1]
  321. boxes = []
  322. logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_layout_list)}")
  323. for left, top, right, bottom in page_layout_list:
  324. left = round(left * x_scale)
  325. top = round(top * y_scale)
  326. right = round(right * x_scale)
  327. bottom = round(bottom * y_scale)
  328. assert (
  329. 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
  330. ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
  331. boxes.append([left, top, right, bottom])
  332. logger.info("layoutreader start")
  333. start = time.time()
  334. orders = do_predict(boxes)
  335. print(orders)
  336. logger.info(f"layoutreader end, cos time{time.time() - start}")
  337. sorted_bboxes = [page_layout_list[i] for i in orders]
  338. layout_bbox_list.append(sorted_bboxes)
  339. pdf_docs = fitz.open('pdf', pdf_bytes)
  340. for i, page in enumerate(pdf_docs):
  341. draw_bbox_with_number(i, layout_bbox_list, page, [102, 102, 255], False)
  342. pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')