magicpdf.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. """
  2. 这里实现2个click命令:
  3. 第一个:
  4. 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
  5. 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
  6. 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
  7. 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
  8. 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  9. 最后把以上步骤准备好的对象传入真正的解析API
  10. 第二个:
  11. 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
  12. 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
  13. 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
  14. 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
  15. 效果:
  16. python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
  17. python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
  18. """
  19. import os
  20. import json as json_parse
  21. import sys
  22. import click
  23. from loguru import logger
  24. from pathlib import Path
  25. from magic_pdf.libs.version import __version__
  26. from magic_pdf.libs.MakeContentConfig import DropMode
  27. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
  28. from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
  29. from magic_pdf.pipe.UNIPipe import UNIPipe
  30. from magic_pdf.pipe.OCRPipe import OCRPipe
  31. from magic_pdf.pipe.TXTPipe import TXTPipe
  32. from magic_pdf.libs.config_reader import get_s3_config
  33. from magic_pdf.libs.path_utils import (
  34. parse_s3path,
  35. parse_s3_range_params,
  36. remove_non_official_s3_args,
  37. )
  38. from magic_pdf.libs.config_reader import get_local_dir
  39. from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  40. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  41. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  42. import csv
# Allowed values for the --method option shared by every sub-command in this file.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
  44. def prepare_env(pdf_file_name, method):
  45. local_parent_dir = os.path.join(
  46. get_local_dir(), "magic-pdf", pdf_file_name, method
  47. )
  48. local_image_dir = os.path.join(str(local_parent_dir), "images")
  49. local_md_dir = local_parent_dir
  50. os.makedirs(local_image_dir, exist_ok=True)
  51. os.makedirs(local_md_dir, exist_ok=True)
  52. return local_image_dir, local_md_dir
  53. def write_to_csv(csv_file_path, csv_data):
  54. with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile:
  55. # 创建csv writer对象
  56. csv_writer = csv.writer(csvfile)
  57. # 写入数据
  58. csv_writer.writerow(csv_data)
  59. print(f"数据已成功追加到 '{csv_file_path}'")
  60. def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
  61. if parse_method == "auto":
  62. jso_useful_key = {
  63. "_pdf_type": "",
  64. "model_list": model_list
  65. }
  66. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  67. elif parse_method == "txt":
  68. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  69. elif parse_method == "ocr":
  70. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  71. else:
  72. print("unknow parse method")
  73. sys.exit(1)
  74. pipe.pipe_classify()
  75. '''如果没有传入有效的模型数据,则使用内置paddle解析'''
  76. if len(model_list) == 0:
  77. pipe.pipe_analyze()
  78. pipe.pipe_parse()
  79. pdf_info = pipe.pdf_mid_data['pdf_info']
  80. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  81. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  82. # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
  83. # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
  84. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
  85. md_writer.write(
  86. content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
  87. )
  88. md_writer.write(
  89. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  90. path=f"{pdf_file_name}.json",
  91. mode=AbsReaderWriter.MODE_TXT,
  92. )
  93. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  94. md_writer.write(
  95. str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
  96. )
# Root command group. It does no work itself; the sub-commands below attach
# to it through @cli.command().
# NOTE: documented with comments rather than a docstring, because click would
# show a docstring as the group's --help text.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    pass
  102. @cli.command()
  103. @click.option("--json", type=str, help="输入一个S3路径")
  104. @click.option(
  105. "--method",
  106. type=parse_pdf_methods,
  107. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  108. default="auto",
  109. )
  110. def json_command(json, method):
  111. if not json.startswith("s3://"):
  112. print("usage: python magipdf.py --json s3://some_bucket/some_path")
  113. sys.exit(1)
  114. def read_s3_path(s3path):
  115. bucket, key = parse_s3path(s3path)
  116. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  117. s3_rw = S3ReaderWriter(
  118. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  119. )
  120. may_range_params = parse_s3_range_params(s3path)
  121. if may_range_params is None or 2 != len(may_range_params):
  122. byte_start, byte_end = 0, None
  123. else:
  124. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  125. byte_end += byte_start - 1
  126. return s3_rw.read_jsonl(
  127. remove_non_official_s3_args(s3path),
  128. byte_start,
  129. byte_end,
  130. AbsReaderWriter.MODE_BIN,
  131. )
  132. jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
  133. s3_file_path = jso.get("file_location")
  134. if s3_file_path is None:
  135. s3_file_path = jso.get("path")
  136. pdf_file_name = Path(s3_file_path).stem
  137. pdf_data = read_s3_path(s3_file_path)
  138. local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
  139. local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  140. local_md_dir
  141. )
  142. _do_parse(
  143. pdf_file_name,
  144. pdf_data,
  145. jso["doc_layout_result"],
  146. method,
  147. local_image_rw,
  148. local_md_rw,
  149. os.path.basename(local_image_dir),
  150. local_md_dir
  151. )
  152. @cli.command()
  153. @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
  154. @click.option(
  155. "--method",
  156. type=parse_pdf_methods,
  157. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  158. default="auto",
  159. )
  160. def local_json_command(local_json, method):
  161. def read_s3_path(s3path):
  162. bucket, key = parse_s3path(s3path)
  163. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  164. s3_rw = S3ReaderWriter(
  165. s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
  166. )
  167. may_range_params = parse_s3_range_params(s3path)
  168. if may_range_params is None or 2 != len(may_range_params):
  169. byte_start, byte_end = 0, None
  170. else:
  171. byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
  172. byte_end += byte_start - 1
  173. return s3_rw.read_jsonl(
  174. remove_non_official_s3_args(s3path),
  175. byte_start,
  176. byte_end,
  177. AbsReaderWriter.MODE_BIN,
  178. )
  179. with open(local_json, "r", encoding="utf-8") as f:
  180. for json_line in f:
  181. jso = json_parse.loads(json_line)
  182. s3_file_path = jso.get("file_location")
  183. if s3_file_path is None:
  184. s3_file_path = jso.get("path")
  185. pdf_file_name = Path(s3_file_path).stem
  186. pdf_data = read_s3_path(s3_file_path)
  187. local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
  188. local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  189. local_md_dir
  190. )
  191. _do_parse(
  192. pdf_file_name,
  193. pdf_data,
  194. jso["doc_layout_result"],
  195. method,
  196. local_image_rw,
  197. local_md_rw,
  198. os.path.basename(local_image_dir),
  199. local_md_dir
  200. )
  201. @cli.command()
  202. @click.option(
  203. "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
  204. )
  205. @click.option("--model", type=click.Path(exists=True), help="模型的路径")
  206. @click.option(
  207. "--method",
  208. type=parse_pdf_methods,
  209. help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
  210. default="auto",
  211. )
  212. def pdf_command(pdf, model, method):
  213. def read_fn(path):
  214. disk_rw = DiskReaderWriter(os.path.dirname(path))
  215. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  216. pdf_data = read_fn(pdf)
  217. def get_model_json(model_path):
  218. # 这里处理pdf和模型相关的逻辑
  219. if model_path is None:
  220. model_path = pdf.replace(".pdf", ".json")
  221. if not os.path.exists(model_path):
  222. logger.warning(f"not found json {model_path} existed, use paddle analyze")
  223. # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
  224. model_json = "[]"
  225. else:
  226. model_json = read_fn(model_path).decode("utf-8")
  227. else:
  228. model_json = read_fn(model_path).decode("utf-8")
  229. return model_json
  230. jso = json_parse.loads(get_model_json(model))
  231. pdf_file_name = Path(pdf).stem
  232. local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
  233. local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  234. local_md_dir
  235. )
  236. _do_parse(
  237. pdf_file_name,
  238. pdf_data,
  239. jso,
  240. method,
  241. local_image_rw,
  242. local_md_rw,
  243. os.path.basename(local_image_dir),
  244. local_md_dir
  245. )
  246. if __name__ == "__main__":
  247. """
  248. python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
  249. """
  250. cli()