# ocr_mkcontent.py (2.1 KB)
  1. from magic_pdf.libs.ocr_content_type import ContentType
  2. def ocr_mk_nlp_markdown(pdf_info_dict: dict):
  3. markdown = []
  4. for _, page_info in pdf_info_dict.items():
  5. blocks = page_info.get("preproc_blocks")
  6. if not blocks:
  7. continue
  8. for block in blocks:
  9. for line in block['lines']:
  10. line_text = ''
  11. for span in line['spans']:
  12. if not span.get('content'):
  13. continue
  14. content = span['content'].replace('$', '\$') # 转义$
  15. if span['type'] == ContentType.InlineEquation:
  16. content = f"${content}$"
  17. elif span['type'] == ContentType.InterlineEquation:
  18. content = f"$$\n{content}\n$$"
  19. line_text += content + ' '
  20. # 在行末添加两个空格以强制换行
  21. markdown.append(line_text.strip() + ' ')
  22. return '\n'.join(markdown)
  23. def ocr_mk_mm_markdown(pdf_info_dict: dict):
  24. markdown = []
  25. for _, page_info in pdf_info_dict.items():
  26. blocks = page_info.get("preproc_blocks")
  27. if not blocks:
  28. continue
  29. for block in blocks:
  30. for line in block['lines']:
  31. line_text = ''
  32. for span in line['spans']:
  33. if not span.get('content'):
  34. if not span.get('image_path'):
  35. continue
  36. else:
  37. content = f"![]({span['image_path']})"
  38. else:
  39. content = span['content'].replace('$', '\$') # 转义$
  40. if span['type'] == ContentType.InlineEquation:
  41. content = f"${content}$"
  42. elif span['type'] == ContentType.InterlineEquation:
  43. content = f"$$\n{content}\n$$"
  44. line_text += content + ' '
  45. # 在行末添加两个空格以强制换行
  46. markdown.append(line_text.strip() + ' ')
  47. return '\n'.join(markdown)