pdf2md.bak 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. import json
  2. import os
  3. import sys
  4. from pathlib import Path
  5. import click
  6. from loguru import logger
  7. from magic_pdf.libs.commons import join_path, read_file
  8. from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
  9. from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
  10. def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
  11. """ """
  12. pth = Path(s3_pdf_path)
  13. book_name = pth.name
  14. # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
  15. save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
  16. save_path = join_path(save_tmp_path, "md")
  17. text_content_save_path = f"{save_path}/{book_name}/book.md"
  18. # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
  19. pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
  20. try:
  21. paras_dict = parse_pdf_by_txt(
  22. pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
  23. )
  24. parent_dir = os.path.dirname(text_content_save_path)
  25. if not os.path.exists(parent_dir):
  26. os.makedirs(parent_dir)
  27. if not paras_dict.get('need_drop'):
  28. content_list = mk_universal_format(paras_dict)
  29. markdown_content = mk_mm_markdown(content_list)
  30. else:
  31. markdown_content = paras_dict['drop_reason']
  32. with open(text_content_save_path, "w", encoding="utf-8") as f:
  33. f.write(markdown_content)
  34. except Exception as e:
  35. print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
  36. logger.exception(e)
  37. @click.command()
  38. @click.option("--pdf-file-path", help="s3上pdf文件的路径")
  39. @click.option("--save-path", help="解析出来的图片,文本的保存父目录")
  40. def main_shell(pdf_file_path: str, save_path: str):
  41. # pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
  42. pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
  43. pdf_bin_file_profile = "s2"
  44. pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
  45. pdf_model_profile = "langchao"
  46. p = Path(pdf_file_path)
  47. pdf_parent_path = p.parent
  48. pdf_file_name = p.name # pdf文件名字,含后缀
  49. pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
  50. pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)
  51. main(
  52. join_path(pdf_bin_file_path, pdf_file_name),
  53. pdf_bin_file_profile,
  54. join_path(pdf_model_dir, pdf_file_name),
  55. pdf_model_profile,
  56. save_path,
  57. )
  58. @click.command()
  59. @click.option("--pdf-dir", help="本地pdf文件的路径")
  60. @click.option("--model-dir", help="本地模型文件的路径")
  61. @click.option("--start-page-num", default=0, help="从第几页开始解析")
  62. def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
  63. # 先扫描所有的pdf目录里的文件名字
  64. pdf_dir = Path(pdf_dir)
  65. model_dir = Path(model_dir)
  66. if pdf_dir.is_file():
  67. pdf_file_names = [pdf_dir.name]
  68. pdf_dir = pdf_dir.parent
  69. else:
  70. pdf_file_names = [f.name for f in pdf_dir.glob("*.pdf")]
  71. for pdf_file in pdf_file_names:
  72. pdf_file_path = os.path.join(pdf_dir, pdf_file)
  73. model_file_path = os.path.join(model_dir, pdf_file).rstrip(".pdf") + ".json"
  74. with open(model_file_path, "r") as json_file:
  75. model_list = json.load(json_file)
  76. main(pdf_file_path, None, model_list, None, start_page_num)
# Script entry point: when run directly, dispatch to the local-file command.
if __name__ == "__main__":
    main_shell2()