mkcontent.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. import math
  2. from loguru import logger
  3. from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
  4. def mk_nlp_markdown(para_dict: dict):
  5. """
  6. 对排序后的bboxes拼接内容
  7. """
  8. content_lst = []
  9. for _, page_info in para_dict.items():
  10. para_blocks = page_info.get("para_blocks")
  11. if not para_blocks:
  12. continue
  13. for block in para_blocks:
  14. item = block["paras"]
  15. for _, p in item.items():
  16. para_text = p["para_text"]
  17. is_title = p["is_para_title"]
  18. title_level = p['para_title_level']
  19. md_title_prefix = "#"*title_level
  20. if is_title:
  21. content_lst.append(f"{md_title_prefix} {para_text}")
  22. else:
  23. content_lst.append(para_text)
  24. content_text = "\n\n".join(content_lst)
  25. return content_text
  26. # 找到目标字符串在段落中的索引
  27. def __find_index(paragraph, target):
  28. index = paragraph.find(target)
  29. if index != -1:
  30. return index
  31. else:
  32. return None
  33. def __insert_string(paragraph, target, postion):
  34. new_paragraph = paragraph[:postion] + target + paragraph[postion:]
  35. return new_paragraph
  36. def __insert_after(content, image_content, target):
  37. """
  38. 在content中找到target,将image_content插入到target后面
  39. """
  40. index = content.find(target)
  41. if index != -1:
  42. content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
  43. else:
  44. logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
  45. return content
  46. def __insert_before(content, image_content, target):
  47. """
  48. 在content中找到target,将image_content插入到target前面
  49. """
  50. index = content.find(target)
  51. if index != -1:
  52. content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
  53. else:
  54. logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
  55. return content
def mk_mm_markdown(para_dict: dict):
    """Assemble multimodal markdown: paragraph text plus image links.

    For each page in ``para_dict`` the paragraphs found under
    ``"para_blocks"`` are joined with blank lines. Every entry of the page's
    ``"images"``, ``"image_backup"``, ``"tables"`` and ``"table_backup"``
    lists is then inserted as ``![](<image_path>)`` next to a text line that
    is looked up through the page's ``"preproc_blocks"``. When a page has no
    ``"para_blocks"`` or no ``"preproc_blocks"``, its markdown consists of the
    image links only.

    NOTE(review): this copy of the file lost its indentation. The nesting
    below is a reconstruction; confirm it against the repository, especially
    the two consecutive ``break`` statements and the ``else`` clauses in the
    image-placement loop.

    Args:
        para_dict: Mapping of page key to page-info dict.

    Returns:
        The markdown of all pages, joined with blank lines.
    """
    content_lst = []
    for _, page_info in para_dict.items():
        page_lst = [] # list of paragraphs within one page
        para_blocks = page_info.get("para_blocks")
        pymu_raw_blocks = page_info.get("preproc_blocks")
        all_page_images = []
        all_page_images.extend(page_info.get("images",[]))
        all_page_images.extend(page_info.get("image_backup", []) )
        all_page_images.extend(page_info.get("tables",[]))
        all_page_images.extend(page_info.get("table_backup",[]) )
        if not para_blocks or not pymu_raw_blocks: # page where only images are joined
            for img in all_page_images:
                page_lst.append(f"![]({img['image_path']})") # TODO: image order
            page_md = "\n\n".join(page_lst)
        else:
            for block in para_blocks:
                item = block["paras"]
                for _, p in item.items():
                    para_text = p["para_text"]
                    is_title = p["is_para_title"]
                    title_level = p['para_title_level']
                    # NOTE(review): evaluated for non-title paragraphs too, so
                    # a non-integer level (e.g. None) raises TypeError here.
                    md_title_prefix = "#"*title_level
                    if is_title:
                        page_lst.append(f"{md_title_prefix} {para_text}")
                    else:
                        page_lst.append(para_text)
            # Join the paragraphs into the text of one page.
            """拼装成一个页面的文本"""
            page_md = "\n\n".join(page_lst)
            # Insert the images.
            """插入图片"""
            for img in all_page_images:
                imgbox = img['bbox']
                img_content = f"![]({img['image_path']})"
                # First check which block the image falls in.
                for block in pymu_raw_blocks:
                    bbox = block['bbox']
                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# image origin is inside this block (1-unit tolerance)
                        for l in block['lines']:
                            line_box = l['bbox']
                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # inside a line: insert before that line
                                line_txt = "".join([s['text'] for s in l['spans']])
                                page_md = __insert_before(page_md, img_content, line_txt)
                                break
                            # NOTE(review): if this second break really sits at
                            # loop-body level, only the first line is ever
                            # tested and the for-else below cannot run — confirm.
                            break
                        else:# between two lines
                            # Find the line whose (x0, y0) is closest to the image's (x0, y0).
                            min_distance = 100000
                            min_line = None
                            for l in block['lines']:
                                line_box = l['bbox']
                                distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
                                if distance < min_distance:
                                    min_distance = distance
                                    min_line = l
                            if min_line:
                                line_txt = "".join([s['text'] for s in min_line['spans']])
                                img_h = imgbox[3] - imgbox[1]
                                if min_distance<img_h: # text comes before the image
                                    page_md = __insert_after(page_md, img_content, line_txt)
                                else:
                                    page_md = __insert_before(page_md, img_content, line_txt)
                            else:
                                logger.error(f"Can't find the location of image {img['image_path']} in the markdown file")
                else:# should be between two blocks
                    # NOTE(review): as reconstructed this is the for-else of
                    # the block loop; nothing breaks out of that loop, so this
                    # branch would run for every image — confirm.
                    # Presumably: use the nearest text block above the image,
                    # or the nearest one below when there is none above.
                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
                    if top_txt_block:
                        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
                        page_md = __insert_after(page_md, img_content, line_txt)
                    else:
                        bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
                        if bottom_txt_block:
                            line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
                            page_md = __insert_before(page_md, img_content, line_txt)
                        else:
                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file")
        content_lst.append(page_md)
    # Join all pages into the final text.
    """拼装成全部页面的文本"""
    content_text = "\n\n".join(content_lst)
    return content_text
  137. @DeprecationWarning
  138. def mk_mm_markdown_1(para_dict: dict):
  139. """
  140. 得到images和tables变量
  141. """
  142. image_all_list = []
  143. for _, page_info in para_dict.items():
  144. images = page_info.get("images",[])
  145. tables = page_info.get("tables",[])
  146. image_backup = page_info.get("image_backup", [])
  147. table_backup = page_info.get("table_backup",[])
  148. all_page_images = []
  149. all_page_images.extend(images)
  150. all_page_images.extend(image_backup)
  151. all_page_images.extend(tables)
  152. all_page_images.extend(table_backup)
  153. pymu_raw_blocks = page_info.get("pymu_raw_blocks")
  154. # 提取每个图片所在位置
  155. for image_info in all_page_images:
  156. x0_image, y0_image, x1_image, y1_image = image_info['bbox'][:4]
  157. image_path = image_info['image_path']
  158. # 判断图片处于原始PDF中哪个模块之间
  159. image_internal_dict = {}
  160. image_external_dict = {}
  161. between_dict = {}
  162. for block in pymu_raw_blocks:
  163. x0, y0, x1, y1 = block['bbox'][:4]
  164. # 在某个模块内部
  165. if x0 <= x0_image < x1 and y0 <= y0_image < y1:
  166. image_internal_dict['bbox'] = [x0_image, y0_image, x1_image, y1_image]
  167. image_internal_dict['path'] = image_path
  168. # 确定图片在哪句文本之前
  169. y_pre = 0
  170. for line in block['lines']:
  171. x0, y0, x1, y1 = line['spans'][0]['bbox']
  172. if x0 <= x0_image < x1 and y_pre <= y0_image < y0:
  173. text = line['spans']['text']
  174. image_internal_dict['text'] = text
  175. image_internal_dict['markdown_image'] = f'![image_path]({image_path})'
  176. break
  177. else:
  178. y_pre = y0
  179. # 在某两个模块之间
  180. elif x0 <= x0_image < x1:
  181. distance = math.sqrt((x1_image - x0)**2 + (y1_image - y0)**2)
  182. between_dict[block['number']] = distance
  183. # 找到与定位点距离最小的文本block
  184. if between_dict:
  185. min_key = min(between_dict, key=between_dict.get)
  186. spans_list = []
  187. for span in pymu_raw_blocks[min_key]['lines']:
  188. for text_piece in span['spans']:
  189. # 防止索引定位文本内容过多
  190. if len(spans_list) < 60:
  191. spans_list.append(text_piece['text'])
  192. text1 = ''.join(spans_list)
  193. image_external_dict['bbox'] = [x0_image, y0_image, x1_image, y1_image]
  194. image_external_dict['path'] = image_path
  195. image_external_dict['text'] = text1
  196. image_external_dict['markdown_image'] = f'![image_path]({image_path})'
  197. # 将内部图片或外部图片存入当页所有图片的列表
  198. if len(image_internal_dict) != 0:
  199. image_all_list.append(image_internal_dict)
  200. elif len(image_external_dict) != 0:
  201. image_all_list.append(image_external_dict)
  202. else:
  203. logger.error(f"Can't find the location of image {image_path} in the markdown file")
  204. content_text = mk_nlp_markdown(para_dict)
  205. for image_info_extract in image_all_list:
  206. loc = __find_index(content_text, image_info_extract['text'])
  207. if loc is not None:
  208. content_text = __insert_string(content_text, image_info_extract['markdown_image'], loc)
  209. else:
  210. logger.error(f"Can't find the location of image {image_info_extract['path']} in the markdown file")
  211. return content_text