magicpdf.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. import os
  2. import json as json_parse
  3. import click
  4. from loguru import logger
  5. from pathlib import Path
  6. from magic_pdf.libs.version import __version__
  7. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  8. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
  9. from magic_pdf.pipe.UNIPipe import UNIPipe
  10. from magic_pdf.pipe.OCRPipe import OCRPipe
  11. from magic_pdf.pipe.TXTPipe import TXTPipe
  12. from magic_pdf.libs.path_utils import (
  13. parse_s3path,
  14. parse_s3_range_params,
  15. remove_non_official_s3_args,
  16. )
  17. from magic_pdf.libs.config_reader import (
  18. get_local_dir,
  19. get_s3_config,
  20. )
  21. from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  22. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  23. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  24. import csv
  25. import copy
  26. import magic_pdf.model as model_config
  27. parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
  28. def prepare_env(pdf_file_name, method):
  29. local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
  30. local_image_dir = os.path.join(str(local_parent_dir), "images")
  31. local_md_dir = local_parent_dir
  32. os.makedirs(local_image_dir, exist_ok=True)
  33. os.makedirs(local_md_dir, exist_ok=True)
  34. return local_image_dir, local_md_dir
  35. def write_to_csv(csv_file_path, csv_data):
  36. with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
  37. # 创建csv writer对象
  38. csv_writer = csv.writer(csvfile)
  39. # 写入数据
  40. csv_writer.writerow(csv_data)
  41. logger.info(f"数据已成功追加到 '{csv_file_path}'")
  42. def do_parse(
  43. pdf_file_name,
  44. pdf_bytes,
  45. model_list,
  46. parse_method,
  47. f_draw_span_bbox=True,
  48. f_draw_layout_bbox=True,
  49. f_dump_md=True,
  50. f_dump_middle_json=True,
  51. f_dump_model_json=True,
  52. f_dump_orig_pdf=True,
  53. f_dump_content_list=True,
  54. f_make_md_mode=MakeMode.MM_MD,
  55. ):
  56. orig_model_list = copy.deepcopy(model_list)
  57. local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
  58. image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
  59. image_dir = str(os.path.basename(local_image_dir))
  60. if parse_method == "auto":
  61. jso_useful_key = {"_pdf_type": "", "model_list": model_list}
  62. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  63. elif parse_method == "txt":
  64. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  65. elif parse_method == "ocr":
  66. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  67. else:
  68. logger.error("unknown parse method")
  69. exit(1)
  70. pipe.pipe_classify()
  71. """如果没有传入有效的模型数据,则使用内置model解析"""
  72. if len(model_list) == 0:
  73. if model_config.__use_inside_model__:
  74. pipe.pipe_analyze()
  75. orig_model_list = copy.deepcopy(pipe.model_list)
  76. else:
  77. logger.error("need model list input")
  78. exit(1)
  79. pipe.pipe_parse()
  80. pdf_info = pipe.pdf_mid_data["pdf_info"]
  81. if f_draw_layout_bbox:
  82. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  83. if f_draw_span_bbox:
  84. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  85. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
  86. if f_dump_md:
  87. """写markdown"""
  88. md_writer.write(
  89. content=md_content,
  90. path=f"{pdf_file_name}.md",
  91. mode=AbsReaderWriter.MODE_TXT,
  92. )
  93. if f_dump_middle_json:
  94. """写middle_json"""
  95. md_writer.write(
  96. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  97. path=f"{pdf_file_name}_middle.json",
  98. mode=AbsReaderWriter.MODE_TXT,
  99. )
  100. if f_dump_model_json:
  101. """写model_json"""
  102. md_writer.write(
  103. content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
  104. path=f"{pdf_file_name}_model.json",
  105. mode=AbsReaderWriter.MODE_TXT,
  106. )
  107. if f_dump_orig_pdf:
  108. """写源pdf"""
  109. md_writer.write(
  110. content=pdf_bytes,
  111. path=f"{pdf_file_name}_origin.pdf",
  112. mode=AbsReaderWriter.MODE_BIN,
  113. )
  114. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  115. if f_dump_content_list:
  116. """写content_list"""
  117. md_writer.write(
  118. content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
  119. path=f"{pdf_file_name}_content_list.json",
  120. mode=AbsReaderWriter.MODE_TXT,
  121. )
  122. logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
  123. @click.group()
  124. @click.version_option(__version__, "--version", "-v", help="显示版本信息")
  125. @click.help_option("--help", "-h", help="显示帮助信息")
  126. def cli():
  127. pass
  128. @cli.command()
  129. @click.option("--json", type=str, help="输入一个S3路径")
  130. @click.option(
  131. "--method",
  132. type=parse_pdf_methods,
  133. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  134. default="auto",
  135. )
  136. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  137. @click.option("--model_mode", type=click.STRING, default="full",
  138. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  139. def json_command(json, method, inside_model, model_mode):
  140. model_config.__use_inside_model__ = inside_model
  141. model_config.__model_mode__ = model_mode
  142. if not json.startswith("s3://"):
  143. logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
  144. exit(1)
  145. def read_s3_path(s3path):
  146. bucket, key = parse_s3path(s3path)
  147. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  148. s3_rw = S3ReaderWriter(
  149. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  150. )
  151. may_range_params = parse_s3_range_params(s3path)
  152. if may_range_params is None or 2 != len(may_range_params):
  153. byte_start, byte_end = 0, None
  154. else:
  155. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  156. byte_end += byte_start - 1
  157. return s3_rw.read_jsonl(
  158. remove_non_official_s3_args(s3path),
  159. byte_start,
  160. byte_end,
  161. AbsReaderWriter.MODE_BIN,
  162. )
  163. jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
  164. s3_file_path = jso.get("file_location")
  165. if s3_file_path is None:
  166. s3_file_path = jso.get("path")
  167. pdf_file_name = Path(s3_file_path).stem
  168. pdf_data = read_s3_path(s3_file_path)
  169. do_parse(
  170. pdf_file_name,
  171. pdf_data,
  172. jso["doc_layout_result"],
  173. method,
  174. )
  175. @cli.command()
  176. @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
  177. @click.option(
  178. "--method",
  179. type=parse_pdf_methods,
  180. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  181. default="auto",
  182. )
  183. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  184. @click.option("--model_mode", type=click.STRING, default="full",
  185. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  186. def local_json_command(local_json, method, inside_model, model_mode):
  187. model_config.__use_inside_model__ = inside_model
  188. model_config.__model_mode__ = model_mode
  189. def read_s3_path(s3path):
  190. bucket, key = parse_s3path(s3path)
  191. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  192. s3_rw = S3ReaderWriter(
  193. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  194. )
  195. may_range_params = parse_s3_range_params(s3path)
  196. if may_range_params is None or 2 != len(may_range_params):
  197. byte_start, byte_end = 0, None
  198. else:
  199. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  200. byte_end += byte_start - 1
  201. return s3_rw.read_jsonl(
  202. remove_non_official_s3_args(s3path),
  203. byte_start,
  204. byte_end,
  205. AbsReaderWriter.MODE_BIN,
  206. )
  207. with open(local_json, "r", encoding="utf-8") as f:
  208. for json_line in f:
  209. jso = json_parse.loads(json_line)
  210. s3_file_path = jso.get("file_location")
  211. if s3_file_path is None:
  212. s3_file_path = jso.get("path")
  213. pdf_file_name = Path(s3_file_path).stem
  214. pdf_data = read_s3_path(s3_file_path)
  215. do_parse(
  216. pdf_file_name,
  217. pdf_data,
  218. jso["doc_layout_result"],
  219. method,
  220. )
  221. @cli.command()
  222. @click.option(
  223. "--pdf", type=click.Path(exists=True), required=True,
  224. help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
  225. @click.option("--model", type=click.Path(exists=True), help="模型的路径")
  226. @click.option(
  227. "--method",
  228. type=parse_pdf_methods,
  229. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  230. default="auto",
  231. )
  232. @click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
  233. @click.option("--model_mode", type=click.STRING, default="full",
  234. help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
  235. def pdf_command(pdf, model, method, inside_model, model_mode):
  236. model_config.__use_inside_model__ = inside_model
  237. model_config.__model_mode__ = model_mode
  238. def read_fn(path):
  239. disk_rw = DiskReaderWriter(os.path.dirname(path))
  240. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  241. def get_model_json(model_path, doc_path):
  242. # 这里处理pdf和模型相关的逻辑
  243. if model_path is None:
  244. file_name_without_extension, extension = os.path.splitext(doc_path)
  245. if extension == ".pdf":
  246. model_path = file_name_without_extension + ".json"
  247. else:
  248. raise Exception("pdf_path input error")
  249. if not os.path.exists(model_path):
  250. logger.warning(
  251. f"not found json {model_path} existed"
  252. )
  253. # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
  254. model_json = "[]"
  255. else:
  256. model_json = read_fn(model_path).decode("utf-8")
  257. else:
  258. model_json = read_fn(model_path).decode("utf-8")
  259. return model_json
  260. def parse_doc(doc_path):
  261. try:
  262. file_name = str(Path(doc_path).stem)
  263. pdf_data = read_fn(doc_path)
  264. jso = json_parse.loads(get_model_json(model, doc_path))
  265. do_parse(
  266. file_name,
  267. pdf_data,
  268. jso,
  269. method,
  270. )
  271. except Exception as e:
  272. logger.exception(e)
  273. if not pdf:
  274. logger.error(f"Error: Missing argument '--pdf'.")
  275. exit(f"Error: Missing argument '--pdf'.")
  276. else:
  277. '''适配多个文档的list文件输入'''
  278. if pdf.endswith(".list"):
  279. with open(pdf, "r") as f:
  280. for line in f.readlines():
  281. line = line.strip()
  282. parse_doc(line)
  283. else:
  284. '''适配单个文档的输入'''
  285. parse_doc(pdf)
  286. if __name__ == "__main__":
  287. """
  288. python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
  289. """
  290. cli()