cli.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. import os
  2. from pathlib import Path
  3. import click
  4. from loguru import logger
  5. import magic_pdf.model as model_config
  6. from magic_pdf.libs.version import __version__
  7. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  8. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  9. from magic_pdf.tools.common import do_parse, parse_pdf_methods
  10. @click.command()
  11. @click.version_option(__version__,
  12. '--version',
  13. '-v',
  14. help='display the version and exit')
  15. @click.option(
  16. '-p',
  17. '--path',
  18. 'path',
  19. type=click.Path(exists=True),
  20. required=True,
  21. help='local pdf filepath or directory',
  22. )
  23. @click.option(
  24. '-o',
  25. '--output-dir',
  26. 'output_dir',
  27. type=click.Path(),
  28. required=True,
  29. help='output local directory',
  30. )
  31. @click.option(
  32. '-m',
  33. '--method',
  34. 'method',
  35. type=parse_pdf_methods,
  36. help="""the method for parsing pdf.
  37. ocr: using ocr technique to extract information from pdf.
  38. txt: suitable for the text-based pdf only and outperform ocr.
  39. auto: automatically choose the best method for parsing pdf from ocr and txt.
  40. without method specified, auto will be used by default.""",
  41. default='auto',
  42. )
  43. @click.option(
  44. <<<<<<< HEAD
  45. '-l',
  46. '--lang',
  47. 'lang',
  48. type=str,
  49. help="""
  50. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
  51. You should input "Abbreviation" with language form url:
  52. https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
  53. """,
  54. default=None,
  55. )
  56. @click.option(
  57. =======
  58. >>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
  59. '-d',
  60. '--debug',
  61. 'debug_able',
  62. type=bool,
  63. help='Enables detailed debugging information during the execution of the CLI commands.',
  64. default=False,
  65. )
  66. @click.option(
  67. '-s',
  68. '--start',
  69. 'start_page_id',
  70. type=int,
  71. help='The starting page for PDF parsing, beginning from 0.',
  72. default=0,
  73. )
  74. @click.option(
  75. '-e',
  76. '--end',
  77. 'end_page_id',
  78. type=int,
  79. help='The ending page for PDF parsing, beginning from 0.',
  80. default=None,
  81. )
  82. <<<<<<< HEAD
  83. def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
  84. =======
  85. def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
  86. >>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
  87. model_config.__use_inside_model__ = True
  88. model_config.__model_mode__ = 'full'
  89. os.makedirs(output_dir, exist_ok=True)
  90. def read_fn(path):
  91. disk_rw = DiskReaderWriter(os.path.dirname(path))
  92. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  93. def parse_doc(doc_path: str):
  94. try:
  95. file_name = str(Path(doc_path).stem)
  96. pdf_data = read_fn(doc_path)
  97. do_parse(
  98. output_dir,
  99. file_name,
  100. pdf_data,
  101. [],
  102. method,
  103. debug_able,
  104. start_page_id=start_page_id,
  105. end_page_id=end_page_id,
  106. <<<<<<< HEAD
  107. lang=lang
  108. =======
  109. >>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
  110. )
  111. except Exception as e:
  112. logger.exception(e)
  113. if os.path.isdir(path):
  114. for doc_path in Path(path).glob('*.pdf'):
  115. parse_doc(doc_path)
  116. else:
  117. parse_doc(path)
  118. if __name__ == '__main__':
  119. cli()