mkcontent.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. import math
  2. from loguru import logger
  3. from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
  4. from magic_pdf.libs.commons import join_path
  5. from magic_pdf.libs.ocr_content_type import ContentType
# Short aliases for the two equation members of ContentType.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
# Node "type" values of the unified content list that carry a "text" field:
# plain text plus the heading levels h1-h6.
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
  9. @DeprecationWarning
  10. def mk_nlp_markdown_1(para_dict: dict):
  11. """
  12. 对排序后的bboxes拼接内容
  13. """
  14. content_lst = []
  15. for _, page_info in para_dict.items():
  16. para_blocks = page_info.get("para_blocks")
  17. if not para_blocks:
  18. continue
  19. for block in para_blocks:
  20. item = block["paras"]
  21. for _, p in item.items():
  22. para_text = p["para_text"]
  23. is_title = p["is_para_title"]
  24. title_level = p['para_title_level']
  25. md_title_prefix = "#"*title_level
  26. if is_title:
  27. content_lst.append(f"{md_title_prefix} {para_text}")
  28. else:
  29. content_lst.append(para_text)
  30. content_text = "\n\n".join(content_lst)
  31. return content_text
  32. # 找到目标字符串在段落中的索引
  33. def __find_index(paragraph, target):
  34. index = paragraph.find(target)
  35. if index != -1:
  36. return index
  37. else:
  38. return None
  39. def __insert_string(paragraph, target, postion):
  40. new_paragraph = paragraph[:postion] + target + paragraph[postion:]
  41. return new_paragraph
  42. def __insert_after(content, image_content, target):
  43. """
  44. 在content中找到target,将image_content插入到target后面
  45. """
  46. index = content.find(target)
  47. if index != -1:
  48. content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
  49. else:
  50. logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
  51. return content
  52. def __insert_before(content, image_content, target):
  53. """
  54. 在content中找到target,将image_content插入到target前面
  55. """
  56. index = content.find(target)
  57. if index != -1:
  58. content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
  59. else:
  60. logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
  61. return content
  62. @DeprecationWarning
  63. def mk_mm_markdown_1(para_dict: dict):
  64. """拼装多模态markdown"""
  65. content_lst = []
  66. for _, page_info in para_dict.items():
  67. page_lst = [] # 一个page内的段落列表
  68. para_blocks = page_info.get("para_blocks")
  69. pymu_raw_blocks = page_info.get("preproc_blocks")
  70. all_page_images = []
  71. all_page_images.extend(page_info.get("images",[]))
  72. all_page_images.extend(page_info.get("image_backup", []) )
  73. all_page_images.extend(page_info.get("tables",[]))
  74. all_page_images.extend(page_info.get("table_backup",[]) )
  75. if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
  76. for img in all_page_images:
  77. page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
  78. page_md = "\n\n".join(page_lst)
  79. else:
  80. for block in para_blocks:
  81. item = block["paras"]
  82. for _, p in item.items():
  83. para_text = p["para_text"]
  84. is_title = p["is_para_title"]
  85. title_level = p['para_title_level']
  86. md_title_prefix = "#"*title_level
  87. if is_title:
  88. page_lst.append(f"{md_title_prefix} {para_text}")
  89. else:
  90. page_lst.append(para_text)
  91. """拼装成一个页面的文本"""
  92. page_md = "\n\n".join(page_lst)
  93. """插入图片"""
  94. for img in all_page_images:
  95. imgbox = img['bbox']
  96. img_content = f"![]({img['image_path']})"
  97. # 先看在哪个block内
  98. for block in pymu_raw_blocks:
  99. bbox = block['bbox']
  100. if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
  101. for l in block['lines']:
  102. line_box = l['bbox']
  103. if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面
  104. line_txt = "".join([s['text'] for s in l['spans']])
  105. page_md = __insert_before(page_md, img_content, line_txt)
  106. break
  107. break
  108. else:# 在行与行之间
  109. # 找到图片x0,y0与line的x0,y0最近的line
  110. min_distance = 100000
  111. min_line = None
  112. for l in block['lines']:
  113. line_box = l['bbox']
  114. distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
  115. if distance < min_distance:
  116. min_distance = distance
  117. min_line = l
  118. if min_line:
  119. line_txt = "".join([s['text'] for s in min_line['spans']])
  120. img_h = imgbox[3] - imgbox[1]
  121. if min_distance<img_h: # 文字在图片前面
  122. page_md = __insert_after(page_md, img_content, line_txt)
  123. else:
  124. page_md = __insert_before(page_md, img_content, line_txt)
  125. else:
  126. logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
  127. else:# 应当在两个block之间
  128. # 找到上方最近的block,如果上方没有就找大下方最近的block
  129. top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
  130. if top_txt_block:
  131. line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
  132. page_md = __insert_after(page_md, img_content, line_txt)
  133. else:
  134. bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
  135. if bottom_txt_block:
  136. line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
  137. page_md = __insert_before(page_md, img_content, line_txt)
  138. else:
  139. logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
  140. content_lst.append(page_md)
  141. """拼装成全部页面的文本"""
  142. content_text = "\n\n".join(content_lst)
  143. return content_text
  144. def __insert_after_para(text, type, element, content_list):
  145. """
  146. 在content_list中找到text,将image_path作为一个新的node插入到text后面
  147. """
  148. for i, c in enumerate(content_list):
  149. content_type = c.get("type")
  150. if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
  151. if type == "image":
  152. content_node = {
  153. "type": "image",
  154. "img_path": element.get("image_path"),
  155. "img_alt": "",
  156. "img_title": "",
  157. "img_caption": "",
  158. }
  159. elif type == "table":
  160. content_node = {
  161. "type": "table",
  162. "img_path": element.get("image_path"),
  163. "table_latex": element.get("text"),
  164. "table_title": "",
  165. "table_caption": "",
  166. "table_quality": element.get("quality"),
  167. }
  168. content_list.insert(i+1, content_node)
  169. break
  170. else:
  171. logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
  172. def __insert_before_para(text, type, element, content_list):
  173. """
  174. 在content_list中找到text,将image_path作为一个新的node插入到text前面
  175. """
  176. for i, c in enumerate(content_list):
  177. content_type = c.get("type")
  178. if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
  179. if type == "image":
  180. content_node = {
  181. "type": "image",
  182. "img_path": element.get("image_path"),
  183. "img_alt": "",
  184. "img_title": "",
  185. "img_caption": "",
  186. }
  187. elif type == "table":
  188. content_node = {
  189. "type": "table",
  190. "img_path": element.get("image_path"),
  191. "table_latex": element.get("text"),
  192. "table_title": "",
  193. "table_caption": "",
  194. "table_quality": element.get("quality"),
  195. }
  196. content_list.insert(i, content_node)
  197. break
  198. else:
  199. logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
  200. def mk_universal_format(pdf_info_list: list, img_buket_path):
  201. """
  202. 构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
  203. """
  204. content_lst = []
  205. for page_info in pdf_info_list:
  206. page_lst = [] # 一个page内的段落列表
  207. para_blocks = page_info.get("para_blocks")
  208. pymu_raw_blocks = page_info.get("preproc_blocks")
  209. all_page_images = []
  210. all_page_images.extend(page_info.get("images",[]))
  211. all_page_images.extend(page_info.get("image_backup", []) )
  212. # all_page_images.extend(page_info.get("tables",[]))
  213. # all_page_images.extend(page_info.get("table_backup",[]) )
  214. all_page_tables = []
  215. all_page_tables.extend(page_info.get("tables", []))
  216. if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
  217. for img in all_page_images:
  218. content_node = {
  219. "type": "image",
  220. "img_path": join_path(img_buket_path, img['image_path']),
  221. "img_alt":"",
  222. "img_title":"",
  223. "img_caption":""
  224. }
  225. page_lst.append(content_node) # TODO 图片顺序
  226. for table in all_page_tables:
  227. content_node = {
  228. "type": "table",
  229. "img_path": join_path(img_buket_path, table['image_path']),
  230. "table_latex": table.get("text"),
  231. "table_title": "",
  232. "table_caption": "",
  233. "table_quality": table.get("quality"),
  234. }
  235. page_lst.append(content_node) # TODO 图片顺序
  236. else:
  237. for block in para_blocks:
  238. item = block["paras"]
  239. for _, p in item.items():
  240. font_type = p['para_font_type']# 对于文本来说,要么是普通文本,要么是个行间公式
  241. if font_type == TYPE_INTERLINE_EQUATION:
  242. content_node = {
  243. "type": "equation",
  244. "latex": p["para_text"]
  245. }
  246. page_lst.append(content_node)
  247. else:
  248. para_text = p["para_text"]
  249. is_title = p["is_para_title"]
  250. title_level = p['para_title_level']
  251. if is_title:
  252. content_node = {
  253. "type": f"h{title_level}",
  254. "text": para_text
  255. }
  256. page_lst.append(content_node)
  257. else:
  258. content_node = {
  259. "type": "text",
  260. "text": para_text
  261. }
  262. page_lst.append(content_node)
  263. content_lst.extend(page_lst)
  264. """插入图片"""
  265. for img in all_page_images:
  266. insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
  267. """插入表格"""
  268. for table in all_page_tables:
  269. insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
  270. # end for
  271. return content_lst
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    """
    Place an image or table node into content_lst next to the text it belongs to.

    The element's top-left corner (bbox[0], bbox[1]) is compared against the
    bboxes of the raw text blocks and their lines; the text of the chosen line
    is then handed to __insert_before_para / __insert_after_para, which look
    that text up in content_lst and insert the new node there.

    Args:
        type: passed through to the insert helpers ("image" or "table" at the
            call sites visible in this file).
        element: dict with a 'bbox' (indexed 0..3) and an 'image_path'.
        pymu_raw_blocks: iterable of block dicts, each with a 'bbox' and
            'lines'; every line has a 'bbox' and 'spans' whose items carry 'text'.
        content_lst: list of content nodes, modified in place by the helpers.

    Returns:
        None.

    NOTE(review): the indentation of this function was reconstructed from a
    flattened source; the break/else nesting below should be confirmed
    against the repository.
    """
    element_bbox = element['bbox']
    # First check which block the element falls into.
    for block in pymu_raw_blocks:
        bbox = block['bbox']
        if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
            3] + 1:  # inside this block (1-unit tolerance); now compare line by line
            for l in block['lines']:
                line_box = l['bbox']
                if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
                    3] + 1:  # inside this line: insert before the line
                    line_txt = "".join([s['text'] for s in l['spans']])
                    __insert_before_para(line_txt, type, element, content_lst)
                    break
                # NOTE(review): as reconstructed, this break runs after the first
                # line whether or not it matched, so the for-else below would
                # only run when block['lines'] is empty - confirm intent.
                break
            else:  # between lines
                # Find the line whose (x0, y0) is closest to the element's (x0, y0).
                min_distance = 100000
                min_line = None
                for l in block['lines']:
                    line_box = l['bbox']
                    distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
                    if distance < min_distance:
                        min_distance = distance
                        min_line = l
                if min_line:
                    line_txt = "".join([s['text'] for s in min_line['spans']])
                    img_h = element_bbox[3] - element_bbox[1]
                    if min_distance < img_h:  # text comes before the image
                        __insert_after_para(line_txt, type, element, content_lst)
                    else:
                        __insert_before_para(line_txt, type, element, content_lst)
                    # Leaves the block loop, which also skips its else branch.
                    break
                else:
                    logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
    else:  # expected to sit between two blocks
        # Runs whenever the block loop finishes without a break.
        # Use the nearest text block above; if there is none, the nearest one below.
        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
        if top_txt_block:
            # Anchor after the last line of the block above.
            line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
            __insert_after_para(line_txt, type, element, content_lst)
        else:
            bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
            if bottom_txt_block:
                # Anchor before the first line of the block below.
                line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
                __insert_before_para(line_txt, type, element, content_lst)
            else:  # TODO: the image may occupy a whole column, with nothing above or below it
                logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
  320. def mk_mm_markdown(content_list):
  321. """
  322. 基于同一格式的内容列表,构造markdown,含图片
  323. """
  324. content_md = []
  325. for c in content_list:
  326. content_type = c.get("type")
  327. if content_type == "text":
  328. content_md.append(c.get("text"))
  329. elif content_type == "equation":
  330. content = c.get("latex")
  331. if content.startswith("$$") and content.endswith("$$"):
  332. content_md.append(content)
  333. else:
  334. content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
  335. elif content_type in UNI_FORMAT_TEXT_TYPE:
  336. content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
  337. elif content_type == "image":
  338. content_md.append(f"![]({c.get('img_path')})")
  339. return "\n\n".join(content_md)
  340. def mk_nlp_markdown(content_list):
  341. """
  342. 基于同一格式的内容列表,构造markdown,不含图片
  343. """
  344. content_md = []
  345. for c in content_list:
  346. content_type = c.get("type")
  347. if content_type == "text":
  348. content_md.append(c.get("text"))
  349. elif content_type == "equation":
  350. content_md.append(f"$$\n{c.get('latex')}\n$$")
  351. elif content_type == "table":
  352. content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
  353. elif content_type in UNI_FORMAT_TEXT_TYPE:
  354. content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
  355. return "\n\n".join(content_md)