vlm_middle_json_mkcontent.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. import os
  2. from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
  3. from mineru.utils.enum_class import MakeMode, BlockType, ContentType
  4. latex_delimiters_config = get_latex_delimiter_config()
  5. default_delimiters = {
  6. 'display': {'left': '$$', 'right': '$$'},
  7. 'inline': {'left': '$', 'right': '$'}
  8. }
  9. delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
  10. display_left_delimiter = delimiters['display']['left']
  11. display_right_delimiter = delimiters['display']['right']
  12. inline_left_delimiter = delimiters['inline']['left']
  13. inline_right_delimiter = delimiters['inline']['right']
  14. def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
  15. para_text = ''
  16. for line in para_block['lines']:
  17. for j, span in enumerate(line['spans']):
  18. span_type = span['type']
  19. content = ''
  20. if span_type == ContentType.TEXT:
  21. content = span['content']
  22. elif span_type == ContentType.INLINE_EQUATION:
  23. content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
  24. elif span_type == ContentType.INTERLINE_EQUATION:
  25. if formula_enable:
  26. content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
  27. else:
  28. if span.get('image_path', ''):
  29. content = f"![]({img_buket_path}/{span['image_path']})"
  30. # content = content.strip()
  31. if content:
  32. if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
  33. if j == len(line['spans']) - 1:
  34. para_text += content
  35. else:
  36. para_text += f'{content} '
  37. elif span_type == ContentType.INTERLINE_EQUATION:
  38. para_text += content
  39. return para_text
  40. def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
  41. page_markdown = []
  42. for para_block in para_blocks:
  43. para_text = ''
  44. para_type = para_block['type']
  45. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
  46. para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
  47. elif para_type == BlockType.TITLE:
  48. title_level = get_title_level(para_block)
  49. para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
  50. elif para_type == BlockType.IMAGE:
  51. if make_mode == MakeMode.NLP_MD:
  52. continue
  53. elif make_mode == MakeMode.MM_MD:
  54. # 检测是否存在图片脚注
  55. has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
  56. # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
  57. if has_image_footnote:
  58. for block in para_block['blocks']: # 1st.拼image_caption
  59. if block['type'] == BlockType.IMAGE_CAPTION:
  60. para_text += merge_para_with_text(block) + ' \n'
  61. for block in para_block['blocks']: # 2nd.拼image_body
  62. if block['type'] == BlockType.IMAGE_BODY:
  63. for line in block['lines']:
  64. for span in line['spans']:
  65. if span['type'] == ContentType.IMAGE:
  66. if span.get('image_path', ''):
  67. para_text += f"![]({img_buket_path}/{span['image_path']})"
  68. for block in para_block['blocks']: # 3rd.拼image_footnote
  69. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  70. para_text += ' \n' + merge_para_with_text(block)
  71. else:
  72. for block in para_block['blocks']: # 1st.拼image_body
  73. if block['type'] == BlockType.IMAGE_BODY:
  74. for line in block['lines']:
  75. for span in line['spans']:
  76. if span['type'] == ContentType.IMAGE:
  77. if span.get('image_path', ''):
  78. para_text += f"![]({img_buket_path}/{span['image_path']})"
  79. for block in para_block['blocks']: # 2nd.拼image_caption
  80. if block['type'] == BlockType.IMAGE_CAPTION:
  81. para_text += ' \n' + merge_para_with_text(block)
  82. elif para_type == BlockType.TABLE:
  83. if make_mode == MakeMode.NLP_MD:
  84. continue
  85. elif make_mode == MakeMode.MM_MD:
  86. for block in para_block['blocks']: # 1st.拼table_caption
  87. if block['type'] == BlockType.TABLE_CAPTION:
  88. para_text += merge_para_with_text(block) + ' \n'
  89. for block in para_block['blocks']: # 2nd.拼table_body
  90. if block['type'] == BlockType.TABLE_BODY:
  91. for line in block['lines']:
  92. for span in line['spans']:
  93. if span['type'] == ContentType.TABLE:
  94. # if processed by table model
  95. if table_enable:
  96. if span.get('html', ''):
  97. para_text += f"\n{span['html']}\n"
  98. elif span.get('image_path', ''):
  99. para_text += f"![]({img_buket_path}/{span['image_path']})"
  100. else:
  101. if span.get('image_path', ''):
  102. para_text += f"![]({img_buket_path}/{span['image_path']})"
  103. for block in para_block['blocks']: # 3rd.拼table_footnote
  104. if block['type'] == BlockType.TABLE_FOOTNOTE:
  105. para_text += '\n' + merge_para_with_text(block) + ' '
  106. if para_text.strip() == '':
  107. continue
  108. else:
  109. # page_markdown.append(para_text.strip() + ' ')
  110. page_markdown.append(para_text.strip())
  111. return page_markdown
  112. def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
  113. para_type = para_block['type']
  114. para_content = {}
  115. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
  116. para_content = {
  117. 'type': 'text',
  118. 'text': merge_para_with_text(para_block),
  119. }
  120. elif para_type == BlockType.TITLE:
  121. title_level = get_title_level(para_block)
  122. para_content = {
  123. 'type': 'text',
  124. 'text': merge_para_with_text(para_block),
  125. }
  126. if title_level != 0:
  127. para_content['text_level'] = title_level
  128. elif para_type == BlockType.INTERLINE_EQUATION:
  129. para_content = {
  130. 'type': 'equation',
  131. 'text': merge_para_with_text(para_block),
  132. 'text_format': 'latex',
  133. }
  134. elif para_type == BlockType.IMAGE:
  135. para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
  136. for block in para_block['blocks']:
  137. if block['type'] == BlockType.IMAGE_BODY:
  138. for line in block['lines']:
  139. for span in line['spans']:
  140. if span['type'] == ContentType.IMAGE:
  141. if span.get('image_path', ''):
  142. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  143. if block['type'] == BlockType.IMAGE_CAPTION:
  144. para_content['img_caption'].append(merge_para_with_text(block))
  145. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  146. para_content['img_footnote'].append(merge_para_with_text(block))
  147. elif para_type == BlockType.TABLE:
  148. para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
  149. for block in para_block['blocks']:
  150. if block['type'] == BlockType.TABLE_BODY:
  151. for line in block['lines']:
  152. for span in line['spans']:
  153. if span['type'] == ContentType.TABLE:
  154. if span.get('html', ''):
  155. para_content['table_body'] = f"{span['html']}"
  156. if span.get('image_path', ''):
  157. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  158. if block['type'] == BlockType.TABLE_CAPTION:
  159. para_content['table_caption'].append(merge_para_with_text(block))
  160. if block['type'] == BlockType.TABLE_FOOTNOTE:
  161. para_content['table_footnote'].append(merge_para_with_text(block))
  162. para_content['page_idx'] = page_idx
  163. return para_content
  164. def union_make(pdf_info_dict: list,
  165. make_mode: str,
  166. img_buket_path: str = '',
  167. ):
  168. formula_enable = get_formula_enable(os.getenv('MINERU_VLM_FORMULA_ENABLE', 'True').lower() == 'true')
  169. table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
  170. output_content = []
  171. for page_info in pdf_info_dict:
  172. paras_of_layout = page_info.get('para_blocks')
  173. page_idx = page_info.get('page_idx')
  174. if not paras_of_layout:
  175. continue
  176. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  177. page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
  178. output_content.extend(page_markdown)
  179. elif make_mode == MakeMode.CONTENT_LIST:
  180. for para_block in paras_of_layout:
  181. para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
  182. output_content.append(para_content)
  183. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  184. return '\n\n'.join(output_content)
  185. elif make_mode == MakeMode.CONTENT_LIST:
  186. return output_content
  187. return None
  188. def get_title_level(block):
  189. title_level = block.get('level', 1)
  190. if title_level > 4:
  191. title_level = 4
  192. elif title_level < 1:
  193. title_level = 0
  194. return title_level