ocr_mkcontent.py 843 B

123456789101112131415161718192021
  def mk_nlp_markdown(pdf_info_dict: dict) -> str:
      """Render the pre-processed blocks of every page as one Markdown string.

      Each value of *pdf_info_dict* is treated as a page mapping. Pages whose
      ``"preproc_blocks"`` entry is missing or empty are skipped. For every
      line of every block, the spans are rendered and joined with a single
      space:

      * every ``$`` in a span's content is escaped as ``\\$``;
      * ``inline_equation`` spans are wrapped as ``$...$``;
      * ``displayed_equation`` spans are wrapped as ``$$``, newline, content,
        newline, ``$$``;
      * spans of any other type are emitted as their escaped content.

      Args:
          pdf_info_dict: Mapping whose values are page dicts. Only the values
              are read; the keys are ignored.

      Returns:
          The rendered lines joined with ``"\\n"``. Each line is stripped of
          surrounding whitespace and then suffixed with two spaces (a
          Markdown hard line break). An empty string if nothing is rendered.

      Raises:
          KeyError: If a block lacks ``"lines"``, a line lacks ``"spans"``, or
              a span lacks ``"content"`` or ``"type"``.
      """
      markdown = []
      for page_info in pdf_info_dict.values():
          blocks = page_info.get("preproc_blocks")
          if not blocks:
              continue
          for block in blocks:
              for line in block['lines']:
                  rendered_spans = []
                  for span in line['spans']:
                      # Escape "$" so it is not mistaken for a math delimiter.
                      content = span['content'].replace('$', '\\$')
                      if span['type'] == 'inline_equation':
                          content = f"${content}$"
                      elif span['type'] == 'displayed_equation':
                          content = f"$$\n{content}\n$$"
                      rendered_spans.append(content)
                  line_text = ' '.join(rendered_spans).strip()
                  # Two trailing spaces force a hard line break in Markdown;
                  # a single trailing space would not.
                  markdown.append(line_text + '  ')
      return '\n'.join(markdown)