cli_dev.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. import json as json_parse
  2. import os
  3. from pathlib import Path
  4. import click
  5. import magic_pdf.model as model_config
  6. from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
  7. from magic_pdf.libs.config_reader import get_s3_config
  8. from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
  9. remove_non_official_s3_args)
  10. from magic_pdf.libs.version import __version__
  11. from magic_pdf.tools.common import do_parse, parse_pdf_methods
  12. def read_s3_path(s3path):
  13. bucket, key = parse_s3path(s3path)
  14. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  15. s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
  16. may_range_params = parse_s3_range_params(s3path)
  17. if may_range_params is None or 2 != len(may_range_params):
  18. byte_start, byte_end = 0, -1
  19. else:
  20. byte_start, byte_end = int(may_range_params[0]), int(
  21. may_range_params[1])
  22. return s3_rw.read_at(
  23. remove_non_official_s3_args(s3path),
  24. byte_start,
  25. byte_end,
  26. )
  27. @click.group()
  28. @click.version_option(__version__, '--version', '-v', help='显示版本信息')
  29. def cli():
  30. pass
  31. @cli.command()
  32. @click.option(
  33. '-j',
  34. '--jsonl',
  35. 'jsonl',
  36. type=str,
  37. help='输入 jsonl 路径,本地或者 s3 上的文件',
  38. required=True,
  39. )
  40. @click.option(
  41. '-m',
  42. '--method',
  43. 'method',
  44. type=parse_pdf_methods,
  45. help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
  46. default='auto',
  47. )
  48. @click.option(
  49. '-o',
  50. '--output-dir',
  51. 'output_dir',
  52. type=click.Path(),
  53. required=True,
  54. help='输出到本地目录',
  55. )
  56. def jsonl(jsonl, method, output_dir):
  57. model_config.__use_inside_model__ = False
  58. if jsonl.startswith('s3://'):
  59. jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
  60. else:
  61. with open(jsonl) as f:
  62. jso = json_parse.loads(f.readline())
  63. os.makedirs(output_dir, exist_ok=True)
  64. s3_file_path = jso.get('file_location')
  65. if s3_file_path is None:
  66. s3_file_path = jso.get('path')
  67. pdf_file_name = Path(s3_file_path).stem
  68. pdf_data = read_s3_path(s3_file_path)
  69. print(pdf_file_name, jso, method)
  70. do_parse(
  71. output_dir,
  72. pdf_file_name,
  73. pdf_data,
  74. jso['doc_layout_result'],
  75. method,
  76. False,
  77. f_dump_content_list=True,
  78. f_draw_model_bbox=True,
  79. )
  80. @cli.command()
  81. @click.option(
  82. '-p',
  83. '--pdf',
  84. 'pdf',
  85. type=click.Path(exists=True),
  86. required=True,
  87. help='本地 PDF 文件',
  88. )
  89. @click.option(
  90. '-j',
  91. '--json',
  92. 'json_data',
  93. type=click.Path(exists=True),
  94. required=True,
  95. help='本地模型推理出的 json 数据',
  96. )
  97. @click.option('-o',
  98. '--output-dir',
  99. 'output_dir',
  100. type=click.Path(),
  101. required=True,
  102. help='本地输出目录')
  103. @click.option(
  104. '-m',
  105. '--method',
  106. 'method',
  107. type=parse_pdf_methods,
  108. help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
  109. default='auto',
  110. )
  111. def pdf(pdf, json_data, output_dir, method):
  112. model_config.__use_inside_model__ = False
  113. full_pdf_path = os.path.realpath(pdf)
  114. os.makedirs(output_dir, exist_ok=True)
  115. def read_fn(path):
  116. disk_rw = FileBasedDataReader(os.path.dirname(path))
  117. return disk_rw.read(os.path.basename(path))
  118. model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
  119. file_name = str(Path(full_pdf_path).stem)
  120. pdf_data = read_fn(full_pdf_path)
  121. do_parse(
  122. output_dir,
  123. file_name,
  124. pdf_data,
  125. model_json_list,
  126. method,
  127. False,
  128. f_dump_content_list=True,
  129. f_draw_model_bbox=True,
  130. )
# Run the click command group when this file is executed as a script.
if __name__ == '__main__':
    cli()