pdf2md.py 3.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. import os
  2. import sys
  3. from pathlib import Path
  4. import click
  5. from loguru import logger
  6. from magic_pdf.libs.commons import join_path, read_file
  7. from magic_pdf.dict2md.mkcontent import mk_mm_markdown
  8. from magic_pdf.pipeline import parse_pdf_by_model
  9. def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
  10. """ """
  11. pth = Path(s3_pdf_path)
  12. book_name = pth.name
  13. # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
  14. save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
  15. save_path = join_path(save_tmp_path, "md")
  16. text_content_save_path = f"{save_path}/{book_name}/book.md"
  17. # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
  18. pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
  19. try:
  20. paras_dict = parse_pdf_by_model(
  21. pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
  22. )
  23. parent_dir = os.path.dirname(text_content_save_path)
  24. if not os.path.exists(parent_dir):
  25. os.makedirs(parent_dir)
  26. if not paras_dict.get('need_drop'):
  27. markdown_content = mk_mm_markdown(paras_dict)
  28. else:
  29. markdown_content = paras_dict['drop_reason']
  30. with open(text_content_save_path, "w", encoding="utf-8") as f:
  31. f.write(markdown_content)
  32. except Exception as e:
  33. print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
  34. logger.exception(e)
  35. @click.command()
  36. @click.option("--pdf-file-path", help="s3上pdf文件的路径")
  37. @click.option("--save-path", help="解析出来的图片,文本的保存父目录")
  38. def main_shell(pdf_file_path: str, save_path: str):
  39. # pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
  40. pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
  41. pdf_bin_file_profile = "s2"
  42. pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
  43. pdf_model_profile = "langchao"
  44. p = Path(pdf_file_path)
  45. pdf_parent_path = p.parent
  46. pdf_file_name = p.name # pdf文件名字,含后缀
  47. pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
  48. pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)
  49. main(
  50. join_path(pdf_bin_file_path, pdf_file_name),
  51. pdf_bin_file_profile,
  52. join_path(pdf_model_dir, pdf_file_name),
  53. pdf_model_profile,
  54. save_path,
  55. )
  56. @click.command()
  57. @click.option("--pdf-dir", help="s3上pdf文件的路径")
  58. @click.option("--model-dir", help="s3上pdf文件的路径")
  59. @click.option("--start-page-num", default=0, help="从第几页开始解析")
  60. def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
  61. # 先扫描所有的pdf目录里的文件名字
  62. pdf_dir = Path(pdf_dir)
  63. model_dir = Path(model_dir)
  64. if pdf_dir.is_file():
  65. pdf_file_names = [pdf_dir.name]
  66. pdf_dir = pdf_dir.parent
  67. else:
  68. pdf_file_names = [f.name for f in pdf_dir.glob("*.pdf")]
  69. for pdf_file in pdf_file_names:
  70. pdf_file_path = os.path.join(pdf_dir, pdf_file)
  71. model_file_path = os.path.join(model_dir, pdf_file)
  72. main(pdf_file_path, None, model_file_path, None, start_page_num)
if __name__ == "__main__":
    # Run the directory-based CLI when this file is executed as a script.
    main_shell2()