magicpdf.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. """
  2. 这里实现2个click命令:
  3. 第一个:
  4. 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
  5. 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
  6. 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
  7. 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
  8. 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  9. 最后把以上步骤准备好的对象传入真正的解析API
  10. 第二个:
  11. 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
  12. 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
  13. 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  14. 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
  15. 效果:
  16. python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
  17. python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
  18. """
import copy
import csv
import json as json_parse
import os
import sys
from pathlib import Path

import click
from loguru import logger

from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import (
    get_local_dir,
    get_s3_config,
)
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import magic_pdf.model as model_config
# Allowed values for the --method option; used as the click type of that
# option on every command in this file.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
  46. def prepare_env(pdf_file_name, method):
  47. local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
  48. local_image_dir = os.path.join(str(local_parent_dir), "images")
  49. local_md_dir = local_parent_dir
  50. os.makedirs(local_image_dir, exist_ok=True)
  51. os.makedirs(local_md_dir, exist_ok=True)
  52. return local_image_dir, local_md_dir
  53. def write_to_csv(csv_file_path, csv_data):
  54. with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
  55. # 创建csv writer对象
  56. csv_writer = csv.writer(csvfile)
  57. # 写入数据
  58. csv_writer.writerow(csv_data)
  59. logger.info(f"数据已成功追加到 '{csv_file_path}'")
  60. def do_parse(
  61. pdf_file_name,
  62. pdf_bytes,
  63. model_list,
  64. parse_method,
  65. f_draw_span_bbox=True,
  66. f_draw_layout_bbox=True,
  67. f_dump_md=True,
  68. f_dump_middle_json=True,
  69. f_dump_model_json=True,
  70. f_dump_orig_pdf=True,
  71. f_dump_content_list=True,
  72. f_make_md_mode=MakeMode.MM_MD,
  73. ):
  74. orig_model_list = copy.deepcopy(model_list)
  75. local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
  76. image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
  77. image_dir = str(os.path.basename(local_image_dir))
  78. if parse_method == "auto":
  79. jso_useful_key = {"_pdf_type": "", "model_list": model_list}
  80. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  81. elif parse_method == "txt":
  82. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  83. elif parse_method == "ocr":
  84. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  85. else:
  86. logger.error("unknown parse method")
  87. exit(1)
  88. pipe.pipe_classify()
  89. """如果没有传入有效的模型数据,则使用内置model解析"""
  90. if len(model_list) == 0:
  91. if model_config.__use_inside_model__:
  92. pipe.pipe_analyze()
  93. orig_model_list = copy.deepcopy(pipe.model_list)
  94. else:
  95. logger.error("need model list input")
  96. exit(1)
  97. pipe.pipe_parse()
  98. pdf_info = pipe.pdf_mid_data["pdf_info"]
  99. if f_draw_layout_bbox:
  100. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  101. if f_draw_span_bbox:
  102. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  103. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
  104. if f_dump_md:
  105. """写markdown"""
  106. md_writer.write(
  107. content=md_content,
  108. path=f"{pdf_file_name}.md",
  109. mode=AbsReaderWriter.MODE_TXT,
  110. )
  111. if f_dump_middle_json:
  112. """写middle_json"""
  113. md_writer.write(
  114. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  115. path=f"{pdf_file_name}_middle.json",
  116. mode=AbsReaderWriter.MODE_TXT,
  117. )
  118. if f_dump_model_json:
  119. """写model_json"""
  120. md_writer.write(
  121. content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
  122. path=f"{pdf_file_name}_model.json",
  123. mode=AbsReaderWriter.MODE_TXT,
  124. )
  125. if f_dump_orig_pdf:
  126. """写源pdf"""
  127. md_writer.write(
  128. content=pdf_bytes,
  129. path=f"{pdf_file_name}_origin.pdf",
  130. mode=AbsReaderWriter.MODE_BIN,
  131. )
  132. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  133. if f_dump_content_list:
  134. """写content_list"""
  135. md_writer.write(
  136. content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
  137. path=f"{pdf_file_name}_content_list.json",
  138. mode=AbsReaderWriter.MODE_TXT,
  139. )
  140. logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
# Root click command group; the commands below attach to it via @cli.command().
# NOTE: documented with comments rather than a docstring because click shows
# a command's docstring in its --help output.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    pass
  146. @cli.command()
  147. @click.option("--json", type=str, help="输入一个S3路径")
  148. @click.option(
  149. "--method",
  150. type=parse_pdf_methods,
  151. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  152. default="auto",
  153. )
  154. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  155. @click.option("--model_mode", type=click.STRING, default="full",
  156. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  157. def json_command(json, method, inside_model, model_mode):
  158. model_config.__use_inside_model__ = inside_model
  159. model_config.__model_mode__ = model_mode
  160. if not json.startswith("s3://"):
  161. logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
  162. exit(1)
  163. def read_s3_path(s3path):
  164. bucket, key = parse_s3path(s3path)
  165. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  166. s3_rw = S3ReaderWriter(
  167. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  168. )
  169. may_range_params = parse_s3_range_params(s3path)
  170. if may_range_params is None or 2 != len(may_range_params):
  171. byte_start, byte_end = 0, None
  172. else:
  173. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  174. byte_end += byte_start - 1
  175. return s3_rw.read_jsonl(
  176. remove_non_official_s3_args(s3path),
  177. byte_start,
  178. byte_end,
  179. AbsReaderWriter.MODE_BIN,
  180. )
  181. jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
  182. s3_file_path = jso.get("file_location")
  183. if s3_file_path is None:
  184. s3_file_path = jso.get("path")
  185. pdf_file_name = Path(s3_file_path).stem
  186. pdf_data = read_s3_path(s3_file_path)
  187. do_parse(
  188. pdf_file_name,
  189. pdf_data,
  190. jso["doc_layout_result"],
  191. method,
  192. )
  193. @cli.command()
  194. @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
  195. @click.option(
  196. "--method",
  197. type=parse_pdf_methods,
  198. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  199. default="auto",
  200. )
  201. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  202. @click.option("--model_mode", type=click.STRING, default="full",
  203. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  204. def local_json_command(local_json, method, inside_model, model_mode):
  205. model_config.__use_inside_model__ = inside_model
  206. model_config.__model_mode__ = model_mode
  207. def read_s3_path(s3path):
  208. bucket, key = parse_s3path(s3path)
  209. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  210. s3_rw = S3ReaderWriter(
  211. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  212. )
  213. may_range_params = parse_s3_range_params(s3path)
  214. if may_range_params is None or 2 != len(may_range_params):
  215. byte_start, byte_end = 0, None
  216. else:
  217. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  218. byte_end += byte_start - 1
  219. return s3_rw.read_jsonl(
  220. remove_non_official_s3_args(s3path),
  221. byte_start,
  222. byte_end,
  223. AbsReaderWriter.MODE_BIN,
  224. )
  225. with open(local_json, "r", encoding="utf-8") as f:
  226. for json_line in f:
  227. jso = json_parse.loads(json_line)
  228. s3_file_path = jso.get("file_location")
  229. if s3_file_path is None:
  230. s3_file_path = jso.get("path")
  231. pdf_file_name = Path(s3_file_path).stem
  232. pdf_data = read_s3_path(s3_file_path)
  233. do_parse(
  234. pdf_file_name,
  235. pdf_data,
  236. jso["doc_layout_result"],
  237. method,
  238. )
  239. @cli.command()
  240. @click.option(
  241. "--pdf", type=click.Path(exists=True), required=True,
  242. help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
  243. @click.option("--model", type=click.Path(exists=True), help="模型的路径")
  244. @click.option(
  245. "--method",
  246. type=parse_pdf_methods,
  247. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  248. default="auto",
  249. )
  250. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  251. @click.option("--model_mode", type=click.STRING, default="full",
  252. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  253. def pdf_command(pdf, model, method, inside_model, model_mode):
  254. model_config.__use_inside_model__ = inside_model
  255. model_config.__model_mode__ = model_mode
  256. def read_fn(path):
  257. disk_rw = DiskReaderWriter(os.path.dirname(path))
  258. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  259. def get_model_json(model_path, doc_path):
  260. # 这里处理pdf和模型相关的逻辑
  261. if model_path is None:
  262. file_name_without_extension, extension = os.path.splitext(doc_path)
  263. if extension == ".pdf":
  264. model_path = file_name_without_extension + ".json"
  265. else:
  266. raise Exception("pdf_path input error")
  267. if not os.path.exists(model_path):
  268. logger.warning(
  269. f"not found json {model_path} existed"
  270. )
  271. # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
  272. model_json = "[]"
  273. else:
  274. model_json = read_fn(model_path).decode("utf-8")
  275. else:
  276. model_json = read_fn(model_path).decode("utf-8")
  277. return model_json
  278. def parse_doc(doc_path):
  279. try:
  280. file_name = str(Path(doc_path).stem)
  281. pdf_data = read_fn(doc_path)
  282. jso = json_parse.loads(get_model_json(model, doc_path))
  283. do_parse(
  284. file_name,
  285. pdf_data,
  286. jso,
  287. method,
  288. )
  289. except Exception as e:
  290. logger.exception(e)
  291. if not pdf:
  292. logger.error(f"Error: Missing argument '--pdf'.")
  293. exit(f"Error: Missing argument '--pdf'.")
  294. else:
  295. '''适配多个文档的list文件输入'''
  296. if pdf.endswith(".list"):
  297. with open(pdf, "r") as f:
  298. for line in f.readlines():
  299. line = line.strip()
  300. parse_doc(line)
  301. else:
  302. '''适配单个文档的输入'''
  303. parse_doc(pdf)
# Script entry point: run the click command group when executed directly.
# The string below is an example invocation kept from the original source.
if __name__ == "__main__":
    """
    python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    """
    cli()