# vlm_middle_json_mkcontent.py (8.2 KB)
  1. import re
  2. from mineru.utils.enum_class import MakeMode, BlockType, ContentType
  3. def merge_para_with_text(para_block):
  4. para_text = ''
  5. for line in para_block['lines']:
  6. for span in line['spans']:
  7. content = span['content']
  8. content = content.strip()
  9. if content:
  10. para_text += content
  11. else:
  12. continue
  13. return para_text
  14. def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path=''):
  15. page_markdown = []
  16. for para_block in para_blocks:
  17. para_text = ''
  18. para_type = para_block['type']
  19. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
  20. para_text = merge_para_with_text(para_block)
  21. elif para_type == BlockType.TITLE:
  22. title_level = get_title_level(para_block)
  23. para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
  24. elif para_type == BlockType.IMAGE:
  25. if make_mode == MakeMode.NLP_MD:
  26. continue
  27. elif make_mode == MakeMode.MM_MD:
  28. # 检测是否存在图片脚注
  29. has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
  30. # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
  31. if has_image_footnote:
  32. for block in para_block['blocks']: # 1st.拼image_caption
  33. if block['type'] == BlockType.IMAGE_CAPTION:
  34. para_text += merge_para_with_text(block) + ' \n'
  35. for block in para_block['blocks']: # 2nd.拼image_body
  36. if block['type'] == BlockType.IMAGE_BODY:
  37. for line in block['lines']:
  38. for span in line['spans']:
  39. if span['type'] == ContentType.IMAGE:
  40. if span.get('image_path', ''):
  41. para_text += f"![]({img_buket_path}/{span['image_path']})"
  42. for block in para_block['blocks']: # 3rd.拼image_footnote
  43. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  44. para_text += ' \n' + merge_para_with_text(block)
  45. else:
  46. for block in para_block['blocks']: # 1st.拼image_body
  47. if block['type'] == BlockType.IMAGE_BODY:
  48. for line in block['lines']:
  49. for span in line['spans']:
  50. if span['type'] == ContentType.IMAGE:
  51. if span.get('image_path', ''):
  52. para_text += f"![]({img_buket_path}/{span['image_path']})"
  53. for block in para_block['blocks']: # 2nd.拼image_caption
  54. if block['type'] == BlockType.IMAGE_CAPTION:
  55. para_text += ' \n' + merge_para_with_text(block)
  56. elif para_type == BlockType.TABLE:
  57. if make_mode == MakeMode.NLP_MD:
  58. continue
  59. elif make_mode == MakeMode.MM_MD:
  60. for block in para_block['blocks']: # 1st.拼table_caption
  61. if block['type'] == BlockType.TABLE_CAPTION:
  62. para_text += merge_para_with_text(block) + ' \n'
  63. for block in para_block['blocks']: # 2nd.拼table_body
  64. if block['type'] == BlockType.TABLE_BODY:
  65. for line in block['lines']:
  66. for span in line['spans']:
  67. if span['type'] == ContentType.TABLE:
  68. # if processed by table model
  69. if span.get('html', ''):
  70. para_text += f"\n{span['html']}\n"
  71. elif span.get('image_path', ''):
  72. para_text += f"![]({img_buket_path}/{span['image_path']})"
  73. for block in para_block['blocks']: # 3rd.拼table_footnote
  74. if block['type'] == BlockType.TABLE_FOOTNOTE:
  75. para_text += '\n' + merge_para_with_text(block) + ' '
  76. if para_text.strip() == '':
  77. continue
  78. else:
  79. # page_markdown.append(para_text.strip() + ' ')
  80. page_markdown.append(para_text.strip())
  81. return page_markdown
  82. def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
  83. para_type = para_block['type']
  84. para_content = {}
  85. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
  86. para_content = {
  87. 'type': 'text',
  88. 'text': merge_para_with_text(para_block),
  89. }
  90. elif para_type == BlockType.TITLE:
  91. title_level = get_title_level(para_block)
  92. para_content = {
  93. 'type': 'text',
  94. 'text': merge_para_with_text(para_block),
  95. }
  96. if title_level != 0:
  97. para_content['text_level'] = title_level
  98. elif para_type == BlockType.INTERLINE_EQUATION:
  99. para_content = {
  100. 'type': 'equation',
  101. 'text': merge_para_with_text(para_block),
  102. 'text_format': 'latex',
  103. }
  104. elif para_type == BlockType.IMAGE:
  105. para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
  106. for block in para_block['blocks']:
  107. if block['type'] == BlockType.IMAGE_BODY:
  108. for line in block['lines']:
  109. for span in line['spans']:
  110. if span['type'] == ContentType.IMAGE:
  111. if span.get('image_path', ''):
  112. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  113. if block['type'] == BlockType.IMAGE_CAPTION:
  114. para_content['img_caption'].append(merge_para_with_text(block))
  115. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  116. para_content['img_footnote'].append(merge_para_with_text(block))
  117. elif para_type == BlockType.TABLE:
  118. para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
  119. for block in para_block['blocks']:
  120. if block['type'] == BlockType.TABLE_BODY:
  121. for line in block['lines']:
  122. for span in line['spans']:
  123. if span['type'] == ContentType.TABLE:
  124. if span.get('html', ''):
  125. para_content['table_body'] = f"{span['html']}"
  126. if span.get('image_path', ''):
  127. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  128. if block['type'] == BlockType.TABLE_CAPTION:
  129. para_content['table_caption'].append(merge_para_with_text(block))
  130. if block['type'] == BlockType.TABLE_FOOTNOTE:
  131. para_content['table_footnote'].append(merge_para_with_text(block))
  132. para_content['page_idx'] = page_idx
  133. return para_content
  134. def union_make(pdf_info_dict: list,
  135. make_mode: str,
  136. img_buket_path: str = '',
  137. ):
  138. output_content = []
  139. for page_info in pdf_info_dict:
  140. paras_of_layout = page_info.get('para_blocks')
  141. page_idx = page_info.get('page_idx')
  142. if not paras_of_layout:
  143. continue
  144. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  145. page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
  146. output_content.extend(page_markdown)
  147. elif make_mode == MakeMode.STANDARD_FORMAT:
  148. for para_block in paras_of_layout:
  149. para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
  150. output_content.append(para_content)
  151. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  152. return '\n\n'.join(output_content)
  153. elif make_mode == MakeMode.STANDARD_FORMAT:
  154. return output_content
  155. return None
  156. def get_title_level(block):
  157. title_level = block.get('level', 1)
  158. if title_level > 4:
  159. title_level = 4
  160. elif title_level < 1:
  161. title_level = 0
  162. return title_level