utils.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. import json
  2. import os
  3. from pathlib import Path
  4. from loguru import logger
  5. import magic_pdf.model as model_config
  6. from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
  7. from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
  8. ElementRelation, ElementRelType,
  9. LayoutElements,
  10. LayoutElementsExtra, PageInfo)
  11. from magic_pdf.libs.ocr_content_type import BlockType, ContentType
  12. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  13. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  14. from magic_pdf.tools.common import do_parse, prepare_env
  15. def convert_middle_json_to_layout_elements(
  16. json_data: dict,
  17. output_dir: str,
  18. ) -> list[LayoutElements]:
  19. uniq_anno_id = 0
  20. res: list[LayoutElements] = []
  21. for page_no, page_data in enumerate(json_data['pdf_info']):
  22. order_id = 0
  23. page_info = PageInfo(
  24. height=int(page_data['page_size'][1]),
  25. width=int(page_data['page_size'][0]),
  26. page_no=page_no,
  27. )
  28. layout_dets: list[ContentObject] = []
  29. extra_element_relation: list[ElementRelation] = []
  30. for para_block in page_data['para_blocks']:
  31. para_text = ''
  32. para_type = para_block['type']
  33. if para_type == BlockType.Text:
  34. para_text = merge_para_with_text(para_block)
  35. x0, y0, x1, y1 = para_block['bbox']
  36. content = ContentObject(
  37. anno_id=uniq_anno_id,
  38. category_type=CategoryType.text,
  39. text=para_text,
  40. order=order_id,
  41. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  42. )
  43. uniq_anno_id += 1
  44. order_id += 1
  45. layout_dets.append(content)
  46. elif para_type == BlockType.Title:
  47. para_text = merge_para_with_text(para_block)
  48. x0, y0, x1, y1 = para_block['bbox']
  49. content = ContentObject(
  50. anno_id=uniq_anno_id,
  51. category_type=CategoryType.title,
  52. text=para_text,
  53. order=order_id,
  54. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  55. )
  56. uniq_anno_id += 1
  57. order_id += 1
  58. layout_dets.append(content)
  59. elif para_type == BlockType.InterlineEquation:
  60. para_text = merge_para_with_text(para_block)
  61. x0, y0, x1, y1 = para_block['bbox']
  62. content = ContentObject(
  63. anno_id=uniq_anno_id,
  64. category_type=CategoryType.interline_equation,
  65. text=para_text,
  66. order=order_id,
  67. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  68. )
  69. uniq_anno_id += 1
  70. order_id += 1
  71. layout_dets.append(content)
  72. elif para_type == BlockType.Image:
  73. body_anno_id = -1
  74. caption_anno_id = -1
  75. for block in para_block['blocks']:
  76. if block['type'] == BlockType.ImageBody:
  77. for line in block['lines']:
  78. for span in line['spans']:
  79. if span['type'] == ContentType.Image:
  80. x0, y0, x1, y1 = block['bbox']
  81. content = ContentObject(
  82. anno_id=uniq_anno_id,
  83. category_type=CategoryType.image_body,
  84. image_path=os.path.join(
  85. output_dir, span['image_path']),
  86. order=order_id,
  87. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  88. )
  89. body_anno_id = uniq_anno_id
  90. uniq_anno_id += 1
  91. order_id += 1
  92. layout_dets.append(content)
  93. for block in para_block['blocks']:
  94. if block['type'] == BlockType.ImageCaption:
  95. para_text += merge_para_with_text(block)
  96. x0, y0, x1, y1 = block['bbox']
  97. content = ContentObject(
  98. anno_id=uniq_anno_id,
  99. category_type=CategoryType.image_caption,
  100. text=para_text,
  101. order=order_id,
  102. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  103. )
  104. caption_anno_id = uniq_anno_id
  105. uniq_anno_id += 1
  106. order_id += 1
  107. layout_dets.append(content)
  108. if body_anno_id > 0 and caption_anno_id > 0:
  109. element_relation = ElementRelation(
  110. relation=ElementRelType.sibling,
  111. source_anno_id=body_anno_id,
  112. target_anno_id=caption_anno_id,
  113. )
  114. extra_element_relation.append(element_relation)
  115. elif para_type == BlockType.Table:
  116. body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1
  117. for block in para_block['blocks']:
  118. if block['type'] == BlockType.TableCaption:
  119. para_text += merge_para_with_text(block)
  120. x0, y0, x1, y1 = block['bbox']
  121. content = ContentObject(
  122. anno_id=uniq_anno_id,
  123. category_type=CategoryType.table_caption,
  124. text=para_text,
  125. order=order_id,
  126. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  127. )
  128. caption_anno_id = uniq_anno_id
  129. uniq_anno_id += 1
  130. order_id += 1
  131. layout_dets.append(content)
  132. for block in para_block['blocks']:
  133. if block['type'] == BlockType.TableBody:
  134. for line in block['lines']:
  135. for span in line['spans']:
  136. if span['type'] == ContentType.Table:
  137. x0, y0, x1, y1 = para_block['bbox']
  138. content = ContentObject(
  139. anno_id=uniq_anno_id,
  140. category_type=CategoryType.table_body,
  141. order=order_id,
  142. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  143. )
  144. body_anno_id = uniq_anno_id
  145. uniq_anno_id += 1
  146. order_id += 1
  147. # if processed by table model
  148. if span.get('latex', ''):
  149. content.latex = span['latex']
  150. else:
  151. content.image_path = os.path.join(
  152. output_dir, span['image_path'])
  153. layout_dets.append(content)
  154. for block in para_block['blocks']:
  155. if block['type'] == BlockType.TableFootnote:
  156. para_text += merge_para_with_text(block)
  157. x0, y0, x1, y1 = block['bbox']
  158. content = ContentObject(
  159. anno_id=uniq_anno_id,
  160. category_type=CategoryType.table_footnote,
  161. text=para_text,
  162. order=order_id,
  163. poly=[x0, y0, x1, y0, x1, y1, x0, y1],
  164. )
  165. footnote_anno_id = uniq_anno_id
  166. uniq_anno_id += 1
  167. order_id += 1
  168. layout_dets.append(content)
  169. if caption_anno_id != -1 and body_anno_id != -1:
  170. element_relation = ElementRelation(
  171. relation=ElementRelType.sibling,
  172. source_anno_id=body_anno_id,
  173. target_anno_id=caption_anno_id,
  174. )
  175. extra_element_relation.append(element_relation)
  176. if footnote_anno_id != -1 and body_anno_id != -1:
  177. element_relation = ElementRelation(
  178. relation=ElementRelType.sibling,
  179. source_anno_id=body_anno_id,
  180. target_anno_id=footnote_anno_id,
  181. )
  182. extra_element_relation.append(element_relation)
  183. res.append(
  184. LayoutElements(
  185. page_info=page_info,
  186. layout_dets=layout_dets,
  187. extra=LayoutElementsExtra(
  188. element_relation=extra_element_relation),
  189. ))
  190. return res
  191. def inference(path, output_dir, method):
  192. model_config.__use_inside_model__ = True
  193. model_config.__model_mode__ = 'full'
  194. if output_dir == '':
  195. if os.path.isdir(path):
  196. output_dir = os.path.join(path, 'output')
  197. else:
  198. output_dir = os.path.join(os.path.dirname(path), 'output')
  199. local_image_dir, local_md_dir = prepare_env(output_dir,
  200. str(Path(path).stem), method)
  201. def read_fn(path):
  202. disk_rw = DiskReaderWriter(os.path.dirname(path))
  203. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  204. def parse_doc(doc_path: str):
  205. try:
  206. file_name = str(Path(doc_path).stem)
  207. pdf_data = read_fn(doc_path)
  208. do_parse(
  209. output_dir,
  210. file_name,
  211. pdf_data,
  212. [],
  213. method,
  214. False,
  215. f_draw_span_bbox=False,
  216. f_draw_layout_bbox=False,
  217. f_dump_md=False,
  218. f_dump_middle_json=True,
  219. f_dump_model_json=False,
  220. f_dump_orig_pdf=False,
  221. f_dump_content_list=False,
  222. f_draw_model_bbox=False,
  223. )
  224. middle_json_fn = os.path.join(local_md_dir,
  225. f'{file_name}_middle.json')
  226. with open(middle_json_fn) as fd:
  227. jso = json.load(fd)
  228. os.remove(middle_json_fn)
  229. return convert_middle_json_to_layout_elements(jso, local_image_dir)
  230. except Exception as e:
  231. logger.exception(e)
  232. return parse_doc(path)
  233. if __name__ == '__main__':
  234. import pprint
  235. base_dir = '/opt/data/pdf/resources/samples/'
  236. if 0:
  237. with open(base_dir + 'json_outputs/middle.json') as f:
  238. d = json.load(f)
  239. result = convert_middle_json_to_layout_elements(d, '/tmp')
  240. pprint.pp(result)
  241. if 0:
  242. with open(base_dir + 'json_outputs/middle.3.json') as f:
  243. d = json.load(f)
  244. result = convert_middle_json_to_layout_elements(d, '/tmp')
  245. pprint.pp(result)
  246. if 1:
  247. res = inference(
  248. base_dir + 'samples/pdf/one_page_with_table_image.pdf',
  249. '/tmp/output',
  250. 'ocr',
  251. )
  252. pprint.pp(res)