# vlm_middle_json_mkcontent.py (8.4 KB)
  1. import re
  2. from ..utils.enum_class import MakeMode, BlockType, ContentType
  3. def merge_para_with_text(para_block):
  4. para_text = ''
  5. for line in para_block['lines']:
  6. for span in line['spans']:
  7. content = span['content']
  8. content = content.strip()
  9. if content:
  10. para_text += content
  11. else:
  12. continue
  13. return para_text
  14. def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path=''):
  15. page_markdown = []
  16. for para_block in para_blocks:
  17. para_text = ''
  18. para_type = para_block['type']
  19. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  20. para_text = merge_para_with_text(para_block)
  21. elif para_type == BlockType.IMAGE:
  22. if make_mode == MakeMode.NLP_MD:
  23. continue
  24. elif make_mode == MakeMode.MM_MD:
  25. # 检测是否存在图片脚注
  26. has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
  27. # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
  28. if has_image_footnote:
  29. for block in para_block['blocks']: # 1st.拼image_caption
  30. if block['type'] == BlockType.IMAGE_CAPTION:
  31. para_text += merge_para_with_text(block) + ' \n'
  32. for block in para_block['blocks']: # 2nd.拼image_body
  33. if block['type'] == BlockType.IMAGE_BODY:
  34. for line in block['lines']:
  35. for span in line['spans']:
  36. if span['type'] == ContentType.IMAGE:
  37. if span.get('image_path', ''):
  38. para_text += f"![]({img_buket_path}/{span['image_path']})"
  39. for block in para_block['blocks']: # 3rd.拼image_footnote
  40. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  41. para_text += ' \n' + merge_para_with_text(block)
  42. else:
  43. for block in para_block['blocks']: # 1st.拼image_body
  44. if block['type'] == BlockType.IMAGE_BODY:
  45. for line in block['lines']:
  46. for span in line['spans']:
  47. if span['type'] == ContentType.IMAGE:
  48. if span.get('image_path', ''):
  49. para_text += f"![]({img_buket_path}/{span['image_path']})"
  50. for block in para_block['blocks']: # 2nd.拼image_caption
  51. if block['type'] == BlockType.IMAGE_CAPTION:
  52. para_text += ' \n' + merge_para_with_text(block)
  53. elif para_type == BlockType.TABLE:
  54. if make_mode == MakeMode.NLP_MD:
  55. continue
  56. elif make_mode == MakeMode.MM_MD:
  57. for block in para_block['blocks']: # 1st.拼table_caption
  58. if block['type'] == BlockType.TABLE_CAPTION:
  59. para_text += merge_para_with_text(block) + ' \n'
  60. for block in para_block['blocks']: # 2nd.拼table_body
  61. if block['type'] == BlockType.TABLE_BODY:
  62. for line in block['lines']:
  63. for span in line['spans']:
  64. if span['type'] == ContentType.TABLE:
  65. # if processed by table model
  66. if span.get('html', ''):
  67. para_text += f"\n{span['html']}\n"
  68. elif span.get('image_path', ''):
  69. para_text += f"![]({img_buket_path}/{span['image_path']})"
  70. for block in para_block['blocks']: # 3rd.拼table_footnote
  71. if block['type'] == BlockType.TABLE_FOOTNOTE:
  72. para_text += '\n' + merge_para_with_text(block) + ' '
  73. if para_text.strip() == '':
  74. continue
  75. else:
  76. # page_markdown.append(para_text.strip() + ' ')
  77. page_markdown.append(para_text.strip())
  78. return page_markdown
  79. def count_leading_hashes(text):
  80. match = re.match(r'^(#+)', text)
  81. return len(match.group(1)) if match else 0
  82. def strip_leading_hashes(text):
  83. # 去除开头的#和紧随其后的空格
  84. return re.sub(r'^#+\s*', '', text)
  85. def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
  86. para_type = para_block['type']
  87. para_content = {}
  88. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
  89. para_content = {
  90. 'type': 'text',
  91. 'text': merge_para_with_text(para_block),
  92. }
  93. elif para_type == BlockType.TITLE:
  94. title_content = merge_para_with_text(para_block)
  95. title_level = count_leading_hashes(title_content)
  96. para_content = {
  97. 'type': 'text',
  98. 'text': strip_leading_hashes(title_content),
  99. }
  100. if title_level != 0:
  101. para_content['text_level'] = title_level
  102. elif para_type == BlockType.INTERLINE_EQUATION:
  103. para_content = {
  104. 'type': 'equation',
  105. 'text': merge_para_with_text(para_block),
  106. 'text_format': 'latex',
  107. }
  108. elif para_type == BlockType.IMAGE:
  109. para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
  110. for block in para_block['blocks']:
  111. if block['type'] == BlockType.IMAGE_BODY:
  112. for line in block['lines']:
  113. for span in line['spans']:
  114. if span['type'] == ContentType.IMAGE:
  115. if span.get('image_path', ''):
  116. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  117. if block['type'] == BlockType.IMAGE_CAPTION:
  118. para_content['img_caption'].append(merge_para_with_text(block))
  119. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  120. para_content['img_footnote'].append(merge_para_with_text(block))
  121. elif para_type == BlockType.TABLE:
  122. para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
  123. for block in para_block['blocks']:
  124. if block['type'] == BlockType.TABLE_BODY:
  125. for line in block['lines']:
  126. for span in line['spans']:
  127. if span['type'] == ContentType.TABLE:
  128. if span.get('html', ''):
  129. para_content['table_body'] = f"{span['html']}"
  130. if span.get('image_path', ''):
  131. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  132. if block['type'] == BlockType.TABLE_CAPTION:
  133. para_content['table_caption'].append(merge_para_with_text(block))
  134. if block['type'] == BlockType.TABLE_FOOTNOTE:
  135. para_content['table_footnote'].append(merge_para_with_text(block))
  136. para_content['page_idx'] = page_idx
  137. return para_content
  138. def union_make(pdf_info_dict: list,
  139. make_mode: str,
  140. img_buket_path: str = '',
  141. ):
  142. output_content = []
  143. for page_info in pdf_info_dict:
  144. paras_of_layout = page_info.get('para_blocks')
  145. page_idx = page_info.get('page_idx')
  146. if not paras_of_layout:
  147. continue
  148. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  149. page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
  150. output_content.extend(page_markdown)
  151. elif make_mode == MakeMode.STANDARD_FORMAT:
  152. for para_block in paras_of_layout:
  153. para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
  154. output_content.append(para_content)
  155. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  156. return '\n\n'.join(output_content)
  157. elif make_mode == MakeMode.STANDARD_FORMAT:
  158. return output_content
  159. return None
  160. def get_title_level(block):
  161. title_level = block.get('level', 1)
  162. if title_level > 4:
  163. title_level = 4
  164. elif title_level < 1:
  165. title_level = 0
  166. return title_level