ocr_mkcontent.py 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
  2. from magic_pdf.libs.ocr_content_type import ContentType
  3. def ocr_mk_nlp_markdown(pdf_info_dict: dict):
  4. markdown = []
  5. for _, page_info in pdf_info_dict.items():
  6. blocks = page_info.get("preproc_blocks")
  7. if not blocks:
  8. continue
  9. for block in blocks:
  10. for line in block['lines']:
  11. line_text = ''
  12. for span in line['spans']:
  13. if not span.get('content'):
  14. continue
  15. content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
  16. if span['type'] == ContentType.InlineEquation:
  17. content = f"${content}$"
  18. elif span['type'] == ContentType.InterlineEquation:
  19. content = f"$$\n{content}\n$$"
  20. line_text += content + ' '
  21. # 在行末添加两个空格以强制换行
  22. markdown.append(line_text.strip() + ' ')
  23. return '\n'.join(markdown)
  24. def ocr_mk_mm_markdown(pdf_info_dict: dict):
  25. markdown = []
  26. for _, page_info in pdf_info_dict.items():
  27. blocks = page_info.get("preproc_blocks")
  28. if not blocks:
  29. continue
  30. for block in blocks:
  31. for line in block['lines']:
  32. line_text = ''
  33. for span in line['spans']:
  34. if not span.get('content'):
  35. if not span.get('image_path'):
  36. continue
  37. else:
  38. content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
  39. else:
  40. content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
  41. if span['type'] == ContentType.InlineEquation:
  42. content = f"${content}$"
  43. elif span['type'] == ContentType.InterlineEquation:
  44. content = f"$$\n{content}\n$$"
  45. line_text += content + ' '
  46. # 在行末添加两个空格以强制换行
  47. markdown.append(line_text.strip() + ' ')
  48. return '\n'.join(markdown)
  49. def mk_mm_markdown2(pdf_info_dict:dict):
  50. markdown = []
  51. for _, page_info in pdf_info_dict.items():
  52. paras = page_info.get("para_blocks")
  53. if not paras:
  54. continue
  55. for para in paras:
  56. para_text = ''
  57. for line in para:
  58. for span in line['spans']:
  59. span_type = span.get('type')
  60. if span_type == ContentType.Text:
  61. para_text += span['content']
  62. elif span_type == ContentType.InlineEquation:
  63. para_text += f" ${span['content']}$ "
  64. elif span_type == ContentType.InterlineEquation:
  65. para_text += f"$$\n{span['content']}\n$$ "
  66. elif span_type == ContentType.Image:
  67. para_text += f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']}) "
  68. markdown.append(para_text)
  69. return '\n\n'.join(markdown)
  70. def ocr_mk_mm_standard_format():
  71. '''
  72. content_list
  73. type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
  74. '''
  75. pass