# mkcontent.py (18 KB)
import math
import warnings

from loguru import logger

from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
                                    find_top_nearest_text_bbox)
from magic_pdf.libs.commons import join_path
  7. TYPE_INLINE_EQUATION = ContentType.InlineEquation
  8. TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
  9. UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
  10. @DeprecationWarning
  11. def mk_nlp_markdown_1(para_dict: dict):
  12. """对排序后的bboxes拼接内容."""
  13. content_lst = []
  14. for _, page_info in para_dict.items():
  15. para_blocks = page_info.get('para_blocks')
  16. if not para_blocks:
  17. continue
  18. for block in para_blocks:
  19. item = block['paras']
  20. for _, p in item.items():
  21. para_text = p['para_text']
  22. is_title = p['is_para_title']
  23. title_level = p['para_title_level']
  24. md_title_prefix = '#' * title_level
  25. if is_title:
  26. content_lst.append(f'{md_title_prefix} {para_text}')
  27. else:
  28. content_lst.append(para_text)
  29. content_text = '\n\n'.join(content_lst)
  30. return content_text
  31. # 找到目标字符串在段落中的索引
  32. def __find_index(paragraph, target):
  33. index = paragraph.find(target)
  34. if index != -1:
  35. return index
  36. else:
  37. return None
  38. def __insert_string(paragraph, target, position):
  39. new_paragraph = paragraph[:position] + target + paragraph[position:]
  40. return new_paragraph
  41. def __insert_after(content, image_content, target):
  42. """在content中找到target,将image_content插入到target后面."""
  43. index = content.find(target)
  44. if index != -1:
  45. content = (
  46. content[: index + len(target)]
  47. + '\n\n'
  48. + image_content
  49. + '\n\n'
  50. + content[index + len(target) :]
  51. )
  52. else:
  53. logger.error(
  54. f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
  55. )
  56. return content
  57. def __insert_before(content, image_content, target):
  58. """在content中找到target,将image_content插入到target前面."""
  59. index = content.find(target)
  60. if index != -1:
  61. content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
  62. else:
  63. logger.error(
  64. f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
  65. )
  66. return content
  67. @DeprecationWarning
  68. def mk_mm_markdown_1(para_dict: dict):
  69. """拼装多模态markdown."""
  70. content_lst = []
  71. for _, page_info in para_dict.items():
  72. page_lst = [] # 一个page内的段落列表
  73. para_blocks = page_info.get('para_blocks')
  74. pymu_raw_blocks = page_info.get('preproc_blocks')
  75. all_page_images = []
  76. all_page_images.extend(page_info.get('images', []))
  77. all_page_images.extend(page_info.get('image_backup', []))
  78. all_page_images.extend(page_info.get('tables', []))
  79. all_page_images.extend(page_info.get('table_backup', []))
  80. if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
  81. for img in all_page_images:
  82. page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
  83. page_md = '\n\n'.join(page_lst)
  84. else:
  85. for block in para_blocks:
  86. item = block['paras']
  87. for _, p in item.items():
  88. para_text = p['para_text']
  89. is_title = p['is_para_title']
  90. title_level = p['para_title_level']
  91. md_title_prefix = '#' * title_level
  92. if is_title:
  93. page_lst.append(f'{md_title_prefix} {para_text}')
  94. else:
  95. page_lst.append(para_text)
  96. """拼装成一个页面的文本"""
  97. page_md = '\n\n'.join(page_lst)
  98. """插入图片"""
  99. for img in all_page_images:
  100. imgbox = img['bbox']
  101. img_content = f"![]({img['image_path']})"
  102. # 先看在哪个block内
  103. for block in pymu_raw_blocks:
  104. bbox = block['bbox']
  105. if (
  106. bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
  107. and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
  108. ): # 确定在block内
  109. for l in block['lines']: # noqa: E741
  110. line_box = l['bbox']
  111. if (
  112. line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
  113. and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
  114. ): # 在line内的,插入line前面
  115. line_txt = ''.join([s['text'] for s in l['spans']])
  116. page_md = __insert_before(
  117. page_md, img_content, line_txt
  118. )
  119. break
  120. break
  121. else: # 在行与行之间
  122. # 找到图片x0,y0与line的x0,y0最近的line
  123. min_distance = 100000
  124. min_line = None
  125. for l in block['lines']: # noqa: E741
  126. line_box = l['bbox']
  127. distance = math.sqrt(
  128. (line_box[0] - imgbox[0]) ** 2
  129. + (line_box[1] - imgbox[1]) ** 2
  130. )
  131. if distance < min_distance:
  132. min_distance = distance
  133. min_line = l
  134. if min_line:
  135. line_txt = ''.join(
  136. [s['text'] for s in min_line['spans']]
  137. )
  138. img_h = imgbox[3] - imgbox[1]
  139. if min_distance < img_h: # 文字在图片前面
  140. page_md = __insert_after(
  141. page_md, img_content, line_txt
  142. )
  143. else:
  144. page_md = __insert_before(
  145. page_md, img_content, line_txt
  146. )
  147. else:
  148. logger.error(
  149. f"Can't find the location of image {img['image_path']} in the markdown file #1"
  150. )
  151. else: # 应当在两个block之间
  152. # 找到上方最近的block,如果上方没有就找大下方最近的block
  153. top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
  154. if top_txt_block:
  155. line_txt = ''.join(
  156. [s['text'] for s in top_txt_block['lines'][-1]['spans']]
  157. )
  158. page_md = __insert_after(page_md, img_content, line_txt)
  159. else:
  160. bottom_txt_block = find_bottom_nearest_text_bbox(
  161. pymu_raw_blocks, imgbox
  162. )
  163. if bottom_txt_block:
  164. line_txt = ''.join(
  165. [
  166. s['text']
  167. for s in bottom_txt_block['lines'][0]['spans']
  168. ]
  169. )
  170. page_md = __insert_before(page_md, img_content, line_txt)
  171. else:
  172. logger.error(
  173. f"Can't find the location of image {img['image_path']} in the markdown file #2"
  174. )
  175. content_lst.append(page_md)
  176. """拼装成全部页面的文本"""
  177. content_text = '\n\n'.join(content_lst)
  178. return content_text
  179. def __insert_after_para(text, type, element, content_list):
  180. """在content_list中找到text,将image_path作为一个新的node插入到text后面."""
  181. for i, c in enumerate(content_list):
  182. content_type = c.get('type')
  183. if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
  184. if type == 'image':
  185. content_node = {
  186. 'type': 'image',
  187. 'img_path': element.get('image_path'),
  188. 'img_alt': '',
  189. 'img_title': '',
  190. 'img_caption': '',
  191. }
  192. elif type == 'table':
  193. content_node = {
  194. 'type': 'table',
  195. 'img_path': element.get('image_path'),
  196. 'table_latex': element.get('text'),
  197. 'table_title': '',
  198. 'table_caption': '',
  199. 'table_quality': element.get('quality'),
  200. }
  201. content_list.insert(i + 1, content_node)
  202. break
  203. else:
  204. logger.error(
  205. f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
  206. )
  207. def __insert_before_para(text, type, element, content_list):
  208. """在content_list中找到text,将image_path作为一个新的node插入到text前面."""
  209. for i, c in enumerate(content_list):
  210. content_type = c.get('type')
  211. if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
  212. if type == 'image':
  213. content_node = {
  214. 'type': 'image',
  215. 'img_path': element.get('image_path'),
  216. 'img_alt': '',
  217. 'img_title': '',
  218. 'img_caption': '',
  219. }
  220. elif type == 'table':
  221. content_node = {
  222. 'type': 'table',
  223. 'img_path': element.get('image_path'),
  224. 'table_latex': element.get('text'),
  225. 'table_title': '',
  226. 'table_caption': '',
  227. 'table_quality': element.get('quality'),
  228. }
  229. content_list.insert(i, content_node)
  230. break
  231. else:
  232. logger.error(
  233. f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
  234. )
def mk_universal_format(pdf_info_list: list, img_buket_path):
    """Build the unified-format content list from per-page info.

    Format reference: https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY

    Args:
        pdf_info_list: List of page info dicts. Keys read from each page:
            'para_blocks', 'preproc_blocks', 'images', 'image_backup' and
            'tables'.
        img_buket_path: Prefix joined (via join_path) to the image path of
            image/table nodes on pages that have no text blocks.

    Returns:
        A flat list of node dicts whose 'type' is 'text', 'h<level>',
        'equation', 'image' or 'table'.

    NOTE(review): the nesting of the statements after the text branch was
    reconstructed from a listing without indentation — confirm against the
    real file.
    """
    content_lst = []
    for page_info in pdf_info_list:
        page_lst = []  # nodes of the current page
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')

        all_page_images = []
        all_page_images.extend(page_info.get('images', []))
        all_page_images.extend(page_info.get('image_backup', []))
        # Tables are collected separately below instead of being mixed in here.
        # all_page_images.extend(page_info.get("tables",[]))
        # all_page_images.extend(page_info.get("table_backup",[]) )
        all_page_tables = []
        all_page_tables.extend(page_info.get('tables', []))

        if not para_blocks or not pymu_raw_blocks:  # page without text: only images/tables
            for img in all_page_images:
                content_node = {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, img['image_path']),
                    'img_alt': '',
                    'img_title': '',
                    'img_caption': '',
                }
                page_lst.append(content_node)  # TODO image order
            for table in all_page_tables:
                content_node = {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, table['image_path']),
                    'table_latex': table.get('text'),
                    'table_title': '',
                    'table_caption': '',
                    'table_quality': table.get('quality'),
                }
                page_lst.append(content_node)  # TODO image order
        else:
            for block in para_blocks:
                item = block['paras']
                for _, p in item.items():
                    font_type = p[
                        'para_font_type'
                    ]  # a paragraph is either ordinary text or an interline equation
                    if font_type == TYPE_INTERLINE_EQUATION:
                        content_node = {'type': 'equation', 'latex': p['para_text']}
                        page_lst.append(content_node)
                    else:
                        para_text = p['para_text']
                        is_title = p['is_para_title']
                        title_level = p['para_title_level']
                        if is_title:
                            # Titles become 'h<level>' nodes.
                            content_node = {
                                'type': f'h{title_level}',
                                'text': para_text,
                            }
                            page_lst.append(content_node)
                        else:
                            content_node = {'type': 'text', 'text': para_text}
                            page_lst.append(content_node)

        content_lst.extend(page_lst)

        # Insert images.
        # NOTE(review): nodes inserted here use the raw 'image_path' without
        # img_buket_path, unlike the no-text branch above — confirm intended.
        """插入图片"""
        for img in all_page_images:
            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)

        # Insert tables.
        """插入表格"""
        for table in all_page_tables:
            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
    # end for
    return content_lst
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    """Insert an image or table node into content_lst next to nearby text.

    The element's x0, y0 (element['bbox'][0:2]) is compared with the bbox of
    each raw block and of its lines to choose a reference line of text; the
    node is then inserted before or after the content node containing that
    text. content_lst is modified in place; failures are logged.

    Args:
        type: 'image' or 'table'; forwarded to the insert helpers.
        element: Dict with a 'bbox' plus the fields read by the insert helpers.
        pymu_raw_blocks: Iterable of block dicts with 'bbox' and 'lines'; each
            line has 'bbox' and 'spans', each span has 'text'.
        content_lst: List of unified-format nodes to insert into.

    NOTE(review): the for/else and break nesting below was reconstructed from
    a listing without indentation — confirm against the real file.
    """
    element_bbox = element['bbox']
    # First check which block contains the element's x0, y0.
    for block in pymu_raw_blocks:
        bbox = block['bbox']
        if (
            bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
            and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
        ):  # inside this block (1-unit tolerance); now compare line by line
            for l in block['lines']:  # noqa: E741
                line_box = l['bbox']
                if (
                    line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
                    and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
                ):  # inside this line: insert before the line
                    line_txt = ''.join([s['text'] for s in l['spans']])
                    __insert_before_para(line_txt, type, element, content_lst)
                    break
                # NOTE(review): unconditional break — only the first line is
                # examined, so the else branch below runs only when the block
                # has no lines. Confirm whether this is intended.
                break
            else:  # between lines
                # Find the line whose x0, y0 is nearest to the element's x0, y0.
                min_distance = 100000
                min_line = None
                for l in block['lines']:  # noqa: E741
                    line_box = l['bbox']
                    distance = math.sqrt(
                        (line_box[0] - element_bbox[0]) ** 2
                        + (line_box[1] - element_bbox[1]) ** 2
                    )
                    if distance < min_distance:
                        min_distance = distance
                        min_line = l
                if min_line:
                    line_txt = ''.join([s['text'] for s in min_line['spans']])
                    img_h = element_bbox[3] - element_bbox[1]
                    if min_distance < img_h:  # text precedes the image
                        __insert_after_para(line_txt, type, element, content_lst)
                    else:
                        __insert_before_para(line_txt, type, element, content_lst)
                    # Leaves the block loop, which also skips its else branch.
                    break
                else:
                    logger.error(
                        f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
                    )
    else:  # expected to sit between two blocks
        # Prefer the nearest text block above; fall back to the nearest below.
        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
        if top_txt_block:
            line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
            __insert_after_para(line_txt, type, element, content_lst)
        else:
            bottom_txt_block = find_bottom_nearest_text_bbox(
                pymu_raw_blocks, element_bbox
            )
            if bottom_txt_block:
                line_txt = ''.join(
                    [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
                )
                __insert_before_para(line_txt, type, element, content_lst)
            else:  # TODO the image may fill a column alone, with no text above or below
                logger.error(
                    f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
                )
  364. def mk_mm_markdown(content_list):
  365. """基于同一格式的内容列表,构造markdown,含图片."""
  366. content_md = []
  367. for c in content_list:
  368. content_type = c.get('type')
  369. if content_type == 'text':
  370. content_md.append(c.get('text'))
  371. elif content_type == 'equation':
  372. content = c.get('latex')
  373. if content.startswith('$$') and content.endswith('$$'):
  374. content_md.append(content)
  375. else:
  376. content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
  377. elif content_type in UNI_FORMAT_TEXT_TYPE:
  378. content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
  379. elif content_type == 'image':
  380. content_md.append(f"![]({c.get('img_path')})")
  381. return '\n\n'.join(content_md)
  382. def mk_nlp_markdown(content_list):
  383. """基于同一格式的内容列表,构造markdown,不含图片."""
  384. content_md = []
  385. for c in content_list:
  386. content_type = c.get('type')
  387. if content_type == 'text':
  388. content_md.append(c.get('text'))
  389. elif content_type == 'equation':
  390. content_md.append(f"$$\n{c.get('latex')}\n$$")
  391. elif content_type == 'table':
  392. content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
  393. elif content_type in UNI_FORMAT_TEXT_TYPE:
  394. content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
  395. return '\n\n'.join(content_md)