cli.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. import os
  2. from pathlib import Path
  3. import click
  4. from loguru import logger
  5. import magic_pdf.model as model_config
  6. from magic_pdf.data.data_reader_writer import FileBasedDataReader
  7. from magic_pdf.libs.version import __version__
  8. from magic_pdf.tools.common import do_parse, parse_pdf_methods
  9. @click.command()
  10. @click.version_option(__version__,
  11. '--version',
  12. '-v',
  13. help='display the version and exit')
  14. @click.option(
  15. '-p',
  16. '--path',
  17. 'path',
  18. type=click.Path(exists=True),
  19. required=True,
  20. help='local pdf filepath or directory',
  21. )
  22. @click.option(
  23. '-o',
  24. '--output-dir',
  25. 'output_dir',
  26. type=click.Path(),
  27. required=True,
  28. help='output local directory',
  29. )
  30. @click.option(
  31. '-m',
  32. '--method',
  33. 'method',
  34. type=parse_pdf_methods,
  35. help="""the method for parsing pdf.
  36. ocr: using ocr technique to extract information from pdf.
  37. txt: suitable for the text-based pdf only and outperform ocr.
  38. auto: automatically choose the best method for parsing pdf from ocr and txt.
  39. without method specified, auto will be used by default.""",
  40. default='auto',
  41. )
  42. @click.option(
  43. '-l',
  44. '--lang',
  45. 'lang',
  46. type=str,
  47. help="""
  48. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
  49. You should input "Abbreviation" with language form url:
  50. https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
  51. """,
  52. default=None,
  53. )
  54. @click.option(
  55. '-d',
  56. '--debug',
  57. 'debug_able',
  58. type=bool,
  59. help='Enables detailed debugging information during the execution of the CLI commands.',
  60. default=False,
  61. )
  62. @click.option(
  63. '-s',
  64. '--start',
  65. 'start_page_id',
  66. type=int,
  67. help='The starting page for PDF parsing, beginning from 0.',
  68. default=0,
  69. )
  70. @click.option(
  71. '-e',
  72. '--end',
  73. 'end_page_id',
  74. type=int,
  75. help='The ending page for PDF parsing, beginning from 0.',
  76. default=None,
  77. )
  78. def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
  79. model_config.__use_inside_model__ = True
  80. model_config.__model_mode__ = 'full'
  81. os.makedirs(output_dir, exist_ok=True)
  82. def read_fn(path):
  83. disk_rw = FileBasedDataReader(os.path.dirname(path))
  84. return disk_rw.read(os.path.basename(path))
  85. def parse_doc(doc_path: str):
  86. try:
  87. file_name = str(Path(doc_path).stem)
  88. pdf_data = read_fn(doc_path)
  89. do_parse(
  90. output_dir,
  91. file_name,
  92. pdf_data,
  93. [],
  94. method,
  95. debug_able,
  96. start_page_id=start_page_id,
  97. end_page_id=end_page_id,
  98. lang=lang
  99. )
  100. except Exception as e:
  101. logger.exception(e)
  102. if os.path.isdir(path):
  103. for doc_path in Path(path).glob('*.pdf'):
  104. parse_doc(doc_path)
  105. else:
  106. parse_doc(path)
  107. if __name__ == '__main__':
  108. cli()