client.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import os
  3. import click
  4. from pathlib import Path
  5. from loguru import logger
  6. from ..version import __version__
  7. from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
  8. @click.command()
  9. @click.version_option(__version__,
  10. '--version',
  11. '-v',
  12. help='display the version and exit')
  13. @click.option(
  14. '-p',
  15. '--path',
  16. 'input_path',
  17. type=click.Path(exists=True),
  18. required=True,
  19. help='local filepath or directory. support pdf, png, jpg, jpeg files',
  20. )
  21. @click.option(
  22. '-o',
  23. '--output-dir',
  24. 'output_dir',
  25. type=click.Path(),
  26. required=True,
  27. help='output local directory',
  28. )
  29. @click.option(
  30. '-b',
  31. '--backend',
  32. 'backend',
  33. type=click.Choice(['pipeline', 'vlm-huggingface', 'vlm-sglang-engine', 'vlm-sglang-client']),
  34. help="""the backend for parsing pdf:
  35. pipeline: More general.
  36. vlm-huggingface: More general.
  37. vlm-sglang-engine: Faster(engine).
  38. vlm-sglang-client: Faster(client).
  39. without method specified, huggingface will be used by default.""",
  40. default='pipeline',
  41. )
  42. @click.option(
  43. '-l',
  44. '--lang',
  45. 'lang',
  46. type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']),
  47. help="""
  48. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
  49. Without languages specified, 'ch' will be used by default.
  50. """,
  51. default='ch',
  52. )
  53. @click.option(
  54. '-u',
  55. '--url',
  56. 'server_url',
  57. type=str,
  58. help="""
  59. When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
  60. """,
  61. default=None,
  62. )
  63. @click.option(
  64. '-s',
  65. '--start',
  66. 'start_page_id',
  67. type=int,
  68. help='The starting page for PDF parsing, beginning from 0.',
  69. default=0,
  70. )
  71. @click.option(
  72. '-e',
  73. '--end',
  74. 'end_page_id',
  75. type=int,
  76. help='The ending page for PDF parsing, beginning from 0.',
  77. default=None,
  78. )
  79. def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_page_id):
  80. os.makedirs(output_dir, exist_ok=True)
  81. def parse_doc(path_list: list[Path]):
  82. try:
  83. file_name_list = []
  84. pdf_bytes_list = []
  85. lang_list = []
  86. for path in path_list:
  87. file_name = str(Path(path).stem)
  88. pdf_bytes = read_fn(path)
  89. file_name_list.append(file_name)
  90. pdf_bytes_list.append(pdf_bytes)
  91. lang_list.append(lang)
  92. do_parse(output_dir, file_name_list, pdf_bytes_list, lang_list, backend, server_url,
  93. start_page_id=start_page_id, end_page_id=end_page_id)
  94. except Exception as e:
  95. logger.exception(e)
  96. if os.path.isdir(input_path):
  97. doc_path_list = []
  98. for doc_path in Path(input_path).glob('*'):
  99. if doc_path.suffix in pdf_suffixes + image_suffixes:
  100. doc_path_list.append(doc_path)
  101. parse_doc(doc_path_list)
  102. else:
  103. parse_doc([Path(input_path)])
# Invoke the click command when this module is executed as the main program.
# Because the module uses relative imports, that means running it as a module
# inside its package (``python -m ...``), not as a standalone file.
if __name__ == '__main__':
    main()