# pdf2md.py
  1. import os
  2. import sys
  3. from pathlib import Path
  4. import click
  5. from loguru import logger
  6. from magic_pdf.libs import join_path
  7. from magic_pdf.dict2md.mkcontent import mk_mm_markdown
  8. from magic_pdf.pipeline import parse_pdf_by_model
  9. def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
  10. """ """
  11. pth = Path(s3_pdf_path)
  12. book_name = pth.name
  13. # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
  14. save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
  15. save_path = join_path(save_tmp_path, "md")
  16. text_content_save_path = f"{save_path}/{book_name}/book.md"
  17. # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
  18. try:
  19. paras_dict = parse_pdf_by_model(
  20. s3_pdf_path, s3_pdf_profile, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
  21. )
  22. parent_dir = os.path.dirname(text_content_save_path)
  23. if not os.path.exists(parent_dir):
  24. os.makedirs(parent_dir)
  25. if not paras_dict.get('need_drop'):
  26. markdown_content = mk_mm_markdown(paras_dict)
  27. else:
  28. markdown_content = paras_dict['drop_reason']
  29. with open(text_content_save_path, "w", encoding="utf-8") as f:
  30. f.write(markdown_content)
  31. except Exception as e:
  32. print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
  33. logger.exception(e)
  34. @click.command()
  35. @click.option("--pdf-file-path", help="s3上pdf文件的路径")
  36. @click.option("--save-path", help="解析出来的图片,文本的保存父目录")
  37. def main_shell(pdf_file_path: str, save_path: str):
  38. # pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
  39. pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
  40. pdf_bin_file_profile = "s2"
  41. pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
  42. pdf_model_profile = "langchao"
  43. p = Path(pdf_file_path)
  44. pdf_parent_path = p.parent
  45. pdf_file_name = p.name # pdf文件名字,含后缀
  46. pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
  47. pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)
  48. main(
  49. join_path(pdf_bin_file_path, pdf_file_name),
  50. pdf_bin_file_profile,
  51. join_path(pdf_model_dir, pdf_file_name),
  52. pdf_model_profile,
  53. save_path,
  54. )
  55. @click.command()
  56. @click.option("--pdf-dir", help="s3上pdf文件的路径")
  57. @click.option("--model-dir", help="s3上pdf文件的路径")
  58. @click.option("--start-page-num", default=0, help="从第几页开始解析")
  59. def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
  60. # 先扫描所有的pdf目录里的文件名字
  61. pdf_dir = Path(pdf_dir)
  62. model_dir = Path(model_dir)
  63. if pdf_dir.is_file():
  64. pdf_file_names = [pdf_dir.name]
  65. pdf_dir = pdf_dir.parent
  66. else:
  67. pdf_file_names = [f.name for f in pdf_dir.glob("*.pdf")]
  68. for pdf_file in pdf_file_names:
  69. pdf_file_path = os.path.join(pdf_dir, pdf_file)
  70. model_file_path = os.path.join(model_dir, pdf_file)
  71. main(pdf_file_path, None, model_file_path, None, start_page_num)
  72. if __name__ == "__main__":
  73. main_shell2()