ocr_mkcontent.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. import re
  2. import wordninja
  3. from loguru import logger
  4. from magic_pdf.libs.commons import join_path
  5. from magic_pdf.libs.language import detect_lang
  6. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  7. from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
  8. from magic_pdf.libs.ocr_content_type import BlockType, ContentType
  9. def __is_hyphen_at_line_end(line):
  10. """
  11. Check if a line ends with one or more letters followed by a hyphen.
  12. Args:
  13. line (str): The line of text to check.
  14. Returns:
  15. bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
  16. """
  17. # Use regex to check if the line ends with one or more letters followed by a hyphen
  18. return bool(re.search(r'[A-Za-z]+-\s*$', line))
  19. def split_long_words(text):
  20. segments = text.split(' ')
  21. for i in range(len(segments)):
  22. words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
  23. for j in range(len(words)):
  24. if len(words[j]) > 10:
  25. words[j] = ' '.join(wordninja.split(words[j]))
  26. segments[i] = ''.join(words)
  27. return ' '.join(segments)
  28. def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
  29. markdown = []
  30. for page_info in pdf_info_list:
  31. paras_of_layout = page_info.get('para_blocks')
  32. page_markdown = ocr_mk_markdown_with_para_core_v2(
  33. paras_of_layout, 'mm', img_buket_path)
  34. markdown.extend(page_markdown)
  35. return '\n\n'.join(markdown)
  36. def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
  37. markdown = []
  38. for page_info in pdf_info_dict:
  39. paras_of_layout = page_info.get('para_blocks')
  40. page_markdown = ocr_mk_markdown_with_para_core_v2(
  41. paras_of_layout, 'nlp')
  42. markdown.extend(page_markdown)
  43. return '\n\n'.join(markdown)
  44. def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
  45. img_buket_path):
  46. markdown_with_para_and_pagination = []
  47. page_no = 0
  48. for page_info in pdf_info_dict:
  49. paras_of_layout = page_info.get('para_blocks')
  50. if not paras_of_layout:
  51. continue
  52. page_markdown = ocr_mk_markdown_with_para_core_v2(
  53. paras_of_layout, 'mm', img_buket_path)
  54. markdown_with_para_and_pagination.append({
  55. 'page_no':
  56. page_no,
  57. 'md_content':
  58. '\n\n'.join(page_markdown)
  59. })
  60. page_no += 1
  61. return markdown_with_para_and_pagination
  62. def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
  63. page_markdown = []
  64. for paras in paras_of_layout:
  65. for para in paras:
  66. para_text = ''
  67. for line in para:
  68. for span in line['spans']:
  69. span_type = span.get('type')
  70. content = ''
  71. language = ''
  72. if span_type == ContentType.Text:
  73. content = span['content']
  74. language = detect_lang(content)
  75. if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
  76. content = ocr_escape_special_markdown_char(
  77. split_long_words(content))
  78. else:
  79. content = ocr_escape_special_markdown_char(content)
  80. elif span_type == ContentType.InlineEquation:
  81. content = f"${span['content']}$"
  82. elif span_type == ContentType.InterlineEquation:
  83. content = f"\n$$\n{span['content']}\n$$\n"
  84. elif span_type in [ContentType.Image, ContentType.Table]:
  85. if mode == 'mm':
  86. content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
  87. elif mode == 'nlp':
  88. pass
  89. if content != '':
  90. if language == 'en': # 英文语境下 content间需要空格分隔
  91. para_text += content + ' '
  92. else: # 中文语境下,content间不需要空格分隔
  93. para_text += content
  94. if para_text.strip() == '':
  95. continue
  96. else:
  97. page_markdown.append(para_text.strip() + ' ')
  98. return page_markdown
  99. def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
  100. mode,
  101. img_buket_path=''):
  102. page_markdown = []
  103. for para_block in paras_of_layout:
  104. para_text = ''
  105. para_type = para_block['type']
  106. if para_type == BlockType.Text:
  107. para_text = merge_para_with_text(para_block)
  108. elif para_type == BlockType.Title:
  109. para_text = f'# {merge_para_with_text(para_block)}'
  110. elif para_type == BlockType.InterlineEquation:
  111. para_text = merge_para_with_text(para_block)
  112. elif para_type == BlockType.Image:
  113. if mode == 'nlp':
  114. continue
  115. elif mode == 'mm':
  116. for block in para_block['blocks']: # 1st.拼image_body
  117. if block['type'] == BlockType.ImageBody:
  118. for line in block['lines']:
  119. for span in line['spans']:
  120. if span['type'] == ContentType.Image:
  121. para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
  122. for block in para_block['blocks']: # 2nd.拼image_caption
  123. if block['type'] == BlockType.ImageCaption:
  124. para_text += merge_para_with_text(block)
  125. for block in para_block['blocks']: # 2nd.拼image_caption
  126. if block['type'] == BlockType.ImageFootnote:
  127. para_text += merge_para_with_text(block)
  128. elif para_type == BlockType.Table:
  129. if mode == 'nlp':
  130. continue
  131. elif mode == 'mm':
  132. for block in para_block['blocks']: # 1st.拼table_caption
  133. if block['type'] == BlockType.TableCaption:
  134. para_text += merge_para_with_text(block)
  135. for block in para_block['blocks']: # 2nd.拼table_body
  136. if block['type'] == BlockType.TableBody:
  137. for line in block['lines']:
  138. for span in line['spans']:
  139. if span['type'] == ContentType.Table:
  140. # if processed by table model
  141. if span.get('latex', ''):
  142. para_text += f"\n\n$\n {span['latex']}\n$\n\n"
  143. elif span.get('html', ''):
  144. para_text += f"\n\n{span['html']}\n\n"
  145. else:
  146. para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
  147. for block in para_block['blocks']: # 3rd.拼table_footnote
  148. if block['type'] == BlockType.TableFootnote:
  149. para_text += merge_para_with_text(block)
  150. if para_text.strip() == '':
  151. continue
  152. else:
  153. page_markdown.append(para_text.strip() + ' ')
  154. return page_markdown
  155. def merge_para_with_text(para_block):
  156. def detect_language(text):
  157. en_pattern = r'[a-zA-Z]+'
  158. en_matches = re.findall(en_pattern, text)
  159. en_length = sum(len(match) for match in en_matches)
  160. if len(text) > 0:
  161. if en_length / len(text) >= 0.5:
  162. return 'en'
  163. else:
  164. return 'unknown'
  165. else:
  166. return 'empty'
  167. para_text = ''
  168. for line in para_block['lines']:
  169. line_text = ''
  170. line_lang = ''
  171. for span in line['spans']:
  172. span_type = span['type']
  173. if span_type == ContentType.Text:
  174. line_text += span['content'].strip()
  175. if line_text != '':
  176. line_lang = detect_lang(line_text)
  177. for span in line['spans']:
  178. span_type = span['type']
  179. content = ''
  180. if span_type == ContentType.Text:
  181. content = span['content']
  182. # language = detect_lang(content)
  183. language = detect_language(content)
  184. if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
  185. content = ocr_escape_special_markdown_char(
  186. split_long_words(content))
  187. else:
  188. content = ocr_escape_special_markdown_char(content)
  189. elif span_type == ContentType.InlineEquation:
  190. content = f" ${span['content']}$ "
  191. elif span_type == ContentType.InterlineEquation:
  192. content = f"\n$$\n{span['content']}\n$$\n"
  193. if content != '':
  194. langs = ['zh', 'ja', 'ko']
  195. if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
  196. para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
  197. elif line_lang == 'en':
  198. # 如果是前一行带有-连字符,那么末尾不应该加空格
  199. if __is_hyphen_at_line_end(content):
  200. para_text += content[:-1]
  201. else:
  202. para_text += content + ' '
  203. else:
  204. para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
  205. return para_text
  206. def para_to_standard_format(para, img_buket_path):
  207. para_content = {}
  208. if len(para) == 1:
  209. para_content = line_to_standard_format(para[0], img_buket_path)
  210. elif len(para) > 1:
  211. para_text = ''
  212. inline_equation_num = 0
  213. for line in para:
  214. for span in line['spans']:
  215. language = ''
  216. span_type = span.get('type')
  217. content = ''
  218. if span_type == ContentType.Text:
  219. content = span['content']
  220. language = detect_lang(content)
  221. if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
  222. content = ocr_escape_special_markdown_char(
  223. split_long_words(content))
  224. else:
  225. content = ocr_escape_special_markdown_char(content)
  226. elif span_type == ContentType.InlineEquation:
  227. content = f"${span['content']}$"
  228. inline_equation_num += 1
  229. if language == 'en': # 英文语境下 content间需要空格分隔
  230. para_text += content + ' '
  231. else: # 中文语境下,content间不需要空格分隔
  232. para_text += content
  233. para_content = {
  234. 'type': 'text',
  235. 'text': para_text,
  236. 'inline_equation_num': inline_equation_num,
  237. }
  238. return para_content
  239. def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
  240. para_type = para_block['type']
  241. if para_type == BlockType.Text:
  242. para_content = {
  243. 'type': 'text',
  244. 'text': merge_para_with_text(para_block),
  245. 'page_idx': page_idx,
  246. }
  247. elif para_type == BlockType.Title:
  248. para_content = {
  249. 'type': 'text',
  250. 'text': merge_para_with_text(para_block),
  251. 'text_level': 1,
  252. 'page_idx': page_idx,
  253. }
  254. elif para_type == BlockType.InterlineEquation:
  255. para_content = {
  256. 'type': 'equation',
  257. 'text': merge_para_with_text(para_block),
  258. 'text_format': 'latex',
  259. 'page_idx': page_idx,
  260. }
  261. elif para_type == BlockType.Image:
  262. para_content = {'type': 'image', 'page_idx': page_idx}
  263. for block in para_block['blocks']:
  264. if block['type'] == BlockType.ImageBody:
  265. para_content['img_path'] = join_path(
  266. img_buket_path,
  267. block['lines'][0]['spans'][0]['image_path'])
  268. if block['type'] == BlockType.ImageCaption:
  269. para_content['img_caption'] = merge_para_with_text(block)
  270. if block['type'] == BlockType.ImageFootnote:
  271. para_content['img_footnote'] = merge_para_with_text(block)
  272. elif para_type == BlockType.Table:
  273. para_content = {'type': 'table', 'page_idx': page_idx}
  274. for block in para_block['blocks']:
  275. if block['type'] == BlockType.TableBody:
  276. if block["lines"][0]["spans"][0].get('latex', ''):
  277. para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
  278. elif block["lines"][0]["spans"][0].get('html', ''):
  279. para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
  280. para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
  281. if block['type'] == BlockType.TableCaption:
  282. para_content['table_caption'] = merge_para_with_text(block)
  283. if block['type'] == BlockType.TableFootnote:
  284. para_content['table_footnote'] = merge_para_with_text(block)
  285. return para_content
  286. def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
  287. content_list = []
  288. for page_info in pdf_info_dict:
  289. paras_of_layout = page_info.get('para_blocks')
  290. if not paras_of_layout:
  291. continue
  292. for para_block in paras_of_layout:
  293. para_content = para_to_standard_format_v2(para_block,
  294. img_buket_path)
  295. content_list.append(para_content)
  296. return content_list
  297. def line_to_standard_format(line, img_buket_path):
  298. line_text = ''
  299. inline_equation_num = 0
  300. for span in line['spans']:
  301. if not span.get('content'):
  302. if not span.get('image_path'):
  303. continue
  304. else:
  305. if span['type'] == ContentType.Image:
  306. content = {
  307. 'type': 'image',
  308. 'img_path': join_path(img_buket_path,
  309. span['image_path']),
  310. }
  311. return content
  312. elif span['type'] == ContentType.Table:
  313. content = {
  314. 'type': 'table',
  315. 'img_path': join_path(img_buket_path,
  316. span['image_path']),
  317. }
  318. return content
  319. else:
  320. if span['type'] == ContentType.InterlineEquation:
  321. interline_equation = span['content']
  322. content = {
  323. 'type': 'equation',
  324. 'latex': f'$$\n{interline_equation}\n$$'
  325. }
  326. return content
  327. elif span['type'] == ContentType.InlineEquation:
  328. inline_equation = span['content']
  329. line_text += f'${inline_equation}$'
  330. inline_equation_num += 1
  331. elif span['type'] == ContentType.Text:
  332. text_content = ocr_escape_special_markdown_char(
  333. span['content']) # 转义特殊符号
  334. line_text += text_content
  335. content = {
  336. 'type': 'text',
  337. 'text': line_text,
  338. 'inline_equation_num': inline_equation_num,
  339. }
  340. return content
  341. def ocr_mk_mm_standard_format(pdf_info_dict: list):
  342. """content_list type string
  343. image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
  344. latex文本字段。 text string 纯文本格式的文本数据。 md string
  345. markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
  346. content_list = []
  347. for page_info in pdf_info_dict:
  348. blocks = page_info.get('preproc_blocks')
  349. if not blocks:
  350. continue
  351. for block in blocks:
  352. for line in block['lines']:
  353. content = line_to_standard_format(line)
  354. content_list.append(content)
  355. return content_list
  356. def union_make(pdf_info_dict: list,
  357. make_mode: str,
  358. drop_mode: str,
  359. img_buket_path: str = ''):
  360. output_content = []
  361. for page_info in pdf_info_dict:
  362. if page_info.get('need_drop', False):
  363. drop_reason = page_info.get('drop_reason')
  364. if drop_mode == DropMode.NONE:
  365. pass
  366. elif drop_mode == DropMode.WHOLE_PDF:
  367. raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
  368. f'drop_reason is {drop_reason}'))
  369. elif drop_mode == DropMode.SINGLE_PAGE:
  370. logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
  371. f'drop_reason is {drop_reason}'))
  372. continue
  373. else:
  374. raise Exception('drop_mode can not be null')
  375. paras_of_layout = page_info.get('para_blocks')
  376. page_idx = page_info.get('page_idx')
  377. if not paras_of_layout:
  378. continue
  379. if make_mode == MakeMode.MM_MD:
  380. page_markdown = ocr_mk_markdown_with_para_core_v2(
  381. paras_of_layout, 'mm', img_buket_path)
  382. output_content.extend(page_markdown)
  383. elif make_mode == MakeMode.NLP_MD:
  384. page_markdown = ocr_mk_markdown_with_para_core_v2(
  385. paras_of_layout, 'nlp')
  386. output_content.extend(page_markdown)
  387. elif make_mode == MakeMode.STANDARD_FORMAT:
  388. for para_block in paras_of_layout:
  389. para_content = para_to_standard_format_v2(
  390. para_block, img_buket_path, page_idx)
  391. output_content.append(para_content)
  392. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  393. return '\n\n'.join(output_content)
  394. elif make_mode == MakeMode.STANDARD_FORMAT:
  395. return output_content