# cli_dev.py (4.0 KB)
  1. import json as json_parse
  2. import os
  3. from pathlib import Path
  4. import click
  5. import magic_pdf.model as model_config
  6. from magic_pdf.libs.config_reader import get_s3_config
  7. from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
  8. remove_non_official_s3_args)
  9. from magic_pdf.libs.version import __version__
  10. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  13. from magic_pdf.tools.common import do_parse, parse_pdf_methods
  14. def read_s3_path(s3path):
  15. bucket, key = parse_s3path(s3path)
  16. s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
  17. s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
  18. remove_non_official_s3_args(s3path))
  19. may_range_params = parse_s3_range_params(s3path)
  20. if may_range_params is None or 2 != len(may_range_params):
  21. byte_start, byte_end = 0, None
  22. else:
  23. byte_start, byte_end = int(may_range_params[0]), int(
  24. may_range_params[1])
  25. return s3_rw.read_offset(
  26. remove_non_official_s3_args(s3path),
  27. byte_start,
  28. byte_end,
  29. )
  30. @click.group()
  31. @click.version_option(__version__, '--version', '-v', help='显示版本信息')
  32. def cli():
  33. pass
  34. @cli.command()
  35. @click.option(
  36. '-j',
  37. '--jsonl',
  38. 'jsonl',
  39. type=str,
  40. help='输入 jsonl 路径,本地或者 s3 上的文件',
  41. required=True,
  42. )
  43. @click.option(
  44. '-m',
  45. '--method',
  46. 'method',
  47. type=parse_pdf_methods,
  48. help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
  49. default='auto',
  50. )
  51. @click.option(
  52. '-o',
  53. '--output-dir',
  54. 'output_dir',
  55. type=click.Path(),
  56. required=True,
  57. help='输出到本地目录',
  58. )
  59. def jsonl(jsonl, method, output_dir):
  60. model_config.__use_inside_model__ = False
  61. if jsonl.startswith('s3://'):
  62. jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
  63. else:
  64. with open(jsonl) as f:
  65. jso = json_parse.loads(f.readline())
  66. os.makedirs(output_dir, exist_ok=True)
  67. s3_file_path = jso.get('file_location')
  68. if s3_file_path is None:
  69. s3_file_path = jso.get('path')
  70. pdf_file_name = Path(s3_file_path).stem
  71. pdf_data = read_s3_path(s3_file_path)
  72. print(pdf_file_name, jso, method)
  73. do_parse(
  74. output_dir,
  75. pdf_file_name,
  76. pdf_data,
  77. jso['doc_layout_result'],
  78. method,
  79. False,
  80. f_dump_content_list=True,
  81. f_draw_model_bbox=True,
  82. )
  83. @cli.command()
  84. @click.option(
  85. '-p',
  86. '--pdf',
  87. 'pdf',
  88. type=click.Path(exists=True),
  89. required=True,
  90. help='本地 PDF 文件',
  91. )
  92. @click.option(
  93. '-j',
  94. '--json',
  95. 'json_data',
  96. type=click.Path(exists=True),
  97. required=True,
  98. help='本地模型推理出的 json 数据',
  99. )
  100. @click.option('-o',
  101. '--output-dir',
  102. 'output_dir',
  103. type=click.Path(),
  104. required=True,
  105. help='本地输出目录')
  106. @click.option(
  107. '-m',
  108. '--method',
  109. 'method',
  110. type=parse_pdf_methods,
  111. help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
  112. default='auto',
  113. )
  114. def pdf(pdf, json_data, output_dir, method):
  115. model_config.__use_inside_model__ = False
  116. full_pdf_path = os.path.realpath(pdf)
  117. os.makedirs(output_dir, exist_ok=True)
  118. def read_fn(path):
  119. disk_rw = DiskReaderWriter(os.path.dirname(path))
  120. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  121. model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
  122. file_name = str(Path(full_pdf_path).stem)
  123. pdf_data = read_fn(full_pdf_path)
  124. do_parse(
  125. output_dir,
  126. file_name,
  127. pdf_data,
  128. model_json_list,
  129. method,
  130. False,
  131. f_dump_content_list=True,
  132. f_draw_model_bbox=True,
  133. )
  134. if __name__ == '__main__':
  135. cli()