ocr_mkcontent.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
  2. from magic_pdf.libs.ocr_content_type import ContentType
  3. def ocr_mk_nlp_markdown(pdf_info_dict: dict):
  4. markdown = []
  5. for _, page_info in pdf_info_dict.items():
  6. blocks = page_info.get("preproc_blocks")
  7. if not blocks:
  8. continue
  9. for block in blocks:
  10. for line in block['lines']:
  11. line_text = ''
  12. for span in line['spans']:
  13. if not span.get('content'):
  14. continue
  15. content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
  16. if span['type'] == ContentType.InlineEquation:
  17. content = f"${content}$"
  18. elif span['type'] == ContentType.InterlineEquation:
  19. content = f"$$\n{content}\n$$"
  20. line_text += content + ' '
  21. # 在行末添加两个空格以强制换行
  22. markdown.append(line_text.strip() + ' ')
  23. return '\n'.join(markdown)
  24. def ocr_mk_mm_markdown(pdf_info_dict: dict):
  25. markdown = []
  26. for _, page_info in pdf_info_dict.items():
  27. blocks = page_info.get("preproc_blocks")
  28. if not blocks:
  29. continue
  30. for block in blocks:
  31. for line in block['lines']:
  32. line_text = ''
  33. for span in line['spans']:
  34. if not span.get('content'):
  35. if not span.get('image_path'):
  36. continue
  37. else:
  38. content = f"![]({span['image_path']})"
  39. else:
  40. content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
  41. if span['type'] == ContentType.InlineEquation:
  42. content = f"${content}$"
  43. elif span['type'] == ContentType.InterlineEquation:
  44. content = f"$$\n{content}\n$$"
  45. line_text += content + ' '
  46. # 在行末添加两个空格以强制换行
  47. markdown.append(line_text.strip() + ' ')
  48. return '\n'.join(markdown)