client.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import os
  3. import click
  4. from pathlib import Path
  5. from loguru import logger
  6. from ..version import __version__
  7. from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
  8. @click.command()
  9. @click.version_option(__version__,
  10. '--version',
  11. '-v',
  12. help='display the version and exit')
  13. @click.option(
  14. '-p',
  15. '--path',
  16. 'input_path',
  17. type=click.Path(exists=True),
  18. required=True,
  19. help='local filepath or directory. support pdf, png, jpg, jpeg files',
  20. )
  21. @click.option(
  22. '-o',
  23. '--output-dir',
  24. 'output_dir',
  25. type=click.Path(),
  26. required=True,
  27. help='output local directory',
  28. )
  29. @click.option(
  30. '-b',
  31. '--backend',
  32. 'backend',
  33. type=click.Choice(['pipeline', 'vlm-huggingface', 'vlm-sglang-engine', 'vlm-sglang-client']),
  34. help="""the backend for parsing pdf:
  35. pipeline: More general.
  36. vlm-huggingface: More general.
  37. vlm-sglang-engine: Faster(engine).
  38. vlm-sglang-client: Faster(client).
  39. without method specified, huggingface will be used by default.""",
  40. default='pipeline',
  41. )
  42. @click.option(
  43. '-u',
  44. '--url',
  45. 'server_url',
  46. type=str,
  47. help="""
  48. When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
  49. """,
  50. default=None,
  51. )
  52. @click.option(
  53. '-s',
  54. '--start',
  55. 'start_page_id',
  56. type=int,
  57. help='The starting page for PDF parsing, beginning from 0.',
  58. default=0,
  59. )
  60. @click.option(
  61. '-e',
  62. '--end',
  63. 'end_page_id',
  64. type=int,
  65. help='The ending page for PDF parsing, beginning from 0.',
  66. default=None,
  67. )
  68. def main(input_path, output_dir, backend, server_url, start_page_id, end_page_id):
  69. os.makedirs(output_dir, exist_ok=True)
  70. def parse_doc(path: Path):
  71. try:
  72. file_name = str(Path(path).stem)
  73. pdf_bits = read_fn(path)
  74. do_parse(output_dir, file_name, pdf_bits, backend, server_url,
  75. start_page_id=start_page_id, end_page_id=end_page_id)
  76. except Exception as e:
  77. logger.exception(e)
  78. if os.path.isdir(input_path):
  79. for doc_path in Path(input_path).glob('*'):
  80. if doc_path.suffix in pdf_suffixes + image_suffixes:
  81. parse_doc(Path(doc_path))
  82. else:
  83. parse_doc(Path(input_path))
  84. if __name__ == '__main__':
  85. main()