magicpdf.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. """
  2. 这里实现2个click命令:
  3. 第一个:
  4. 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
  5. 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
  6. 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
  7. 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
  8. 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  9. 最后把以上步骤准备好的对象传入真正的解析API
  10. 第二个:
  11. 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
  12. 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
  13. 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  14. 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
  15. 效果:
  16. python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
  17. python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
  18. """
  19. import os
  20. import json as json_parse
  21. from datetime import datetime
  22. import click
  23. from magic_pdf.pipe.UNIPipe import UNIPipe
  24. from magic_pdf.libs.config_reader import get_s3_config
  25. from magic_pdf.libs.path_utils import (
  26. parse_s3path,
  27. parse_s3_range_params,
  28. remove_non_official_s3_args,
  29. )
  30. from magic_pdf.libs.config_reader import get_local_dir
  31. from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter, MODE_BIN, MODE_TXT
  32. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  33. from magic_pdf.libs.json_compressor import JsonCompressor
# Allowed values for the --method option; used as the option type by both
# json_command and pdf_command.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
  35. def prepare_env():
  36. local_parent_dir = os.path.join(
  37. get_local_dir(), "magic-pdf", datetime.now().strftime("%Y-%m-%d")
  38. )
  39. local_image_dir = os.path.join(local_parent_dir, "images")
  40. local_md_dir = os.path.join(local_parent_dir, "md")
  41. os.makedirs(local_image_dir, exist_ok=True)
  42. os.makedirs(local_md_dir, exist_ok=True)
  43. return local_image_dir, local_md_dir
  44. def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
  45. uni_pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir)
  46. jso_useful_key = {
  47. "_pdf_type": "txt",
  48. "model_list": model_list,
  49. }
  50. if parse_method == "ocr":
  51. jso_useful_key["_pdf_type"] = "ocr"
  52. uni_pipe.pipe_parse()
  53. md_content = uni_pipe.pipe_mk_markdown()
  54. part_file_name = datetime.now().strftime("%H-%M-%S")
  55. md_writer.write(content=md_content, path=f"{part_file_name}.md", mode=MODE_TXT)
  56. md_writer.write(
  57. content=json_parse.dumps(
  58. uni_pipe.pdf_mid_data, ensure_ascii=False, indent=4
  59. ),
  60. path=f"{part_file_name}.json",
  61. mode=MODE_TXT,
  62. )
  63. @click.group()
  64. def cli():
  65. pass
  66. @cli.command()
  67. @click.option("--json", type=str, help="输入一个S3路径")
  68. @click.option(
  69. "--method",
  70. type=parse_pdf_methods,
  71. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  72. default="auto",
  73. )
  74. def json_command(json, method):
  75. if not json.startswith("s3://"):
  76. print("usage: python magipdf.py --json s3://some_bucket/some_path")
  77. os.exit(1)
  78. def read_s3_path(s3path):
  79. bucket, key = parse_s3path(s3path)
  80. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  81. s3_rw = S3ReaderWriter(
  82. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  83. )
  84. may_range_params = parse_s3_range_params(json)
  85. if may_range_params is None or 2 != len(may_range_params):
  86. byte_start, byte_end = 0, None
  87. else:
  88. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  89. return s3_rw.read_jsonl(
  90. remove_non_official_s3_args(s3path), byte_start, byte_end, MODE_BIN
  91. )
  92. jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
  93. pdf_data = read_s3_path(jso["file_location"])
  94. local_image_dir, local_md_dir = prepare_env()
  95. local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  96. local_md_dir
  97. )
  98. _do_parse(
  99. pdf_data,
  100. jso['doc_layout_result'],
  101. method,
  102. local_image_rw,
  103. local_md_rw,
  104. local_image_dir,
  105. )
  106. @cli.command()
  107. @click.option(
  108. "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
  109. )
  110. @click.option("--model", type=click.Path(exists=True), help="模型的路径")
  111. @click.option(
  112. "--method",
  113. type=parse_pdf_methods,
  114. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  115. default="auto",
  116. )
  117. def pdf_command(pdf, model, method):
  118. # 这里处理pdf和模型相关的逻辑
  119. if model is None:
  120. model = pdf.replace(".pdf", ".json")
  121. if not os.path.exists(model):
  122. print(f"make sure json file existed and place under {os.dirname(pdf)}")
  123. os.eixt(1)
  124. def read_fn(path):
  125. disk_rw = DiskReaderWriter(os.path.dirname(path))
  126. return disk_rw.read(os.path.basename(path), MODE_BIN)
  127. pdf_data = read_fn(pdf)
  128. jso = json_parse.loads(read_fn(model).decode("utf-8"))
  129. local_image_dir, local_md_dir = prepare_env()
  130. local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  131. local_md_dir
  132. )
  133. _do_parse(
  134. pdf_data,
  135. jso,
  136. method,
  137. local_image_rw,
  138. local_md_rw,
  139. local_image_dir,
  140. )
  141. if __name__ == "__main__":
  142. """
  143. python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
  144. """
  145. cli()