cli.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. import os
  2. import click
  3. from loguru import logger
  4. from pathlib import Path
  5. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  6. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  7. import magic_pdf.model as model_config
  8. from magic_pdf.tools.common import parse_pdf_methods, do_parse
  9. from magic_pdf.libs.version import __version__
  10. @click.command()
  11. @click.version_option(__version__, "--version", "-v", help="display the version and exit")
  12. @click.option(
  13. "-p",
  14. "--path",
  15. "path",
  16. type=click.Path(exists=True),
  17. required=True,
  18. help="local pdf filepath or directory",
  19. )
  20. @click.option(
  21. "-o",
  22. "--output-dir",
  23. "output_dir",
  24. type=str,
  25. help="output local directory",
  26. default="",
  27. )
  28. @click.option(
  29. "-m",
  30. "--method",
  31. "method",
  32. type=parse_pdf_methods,
  33. help="""the method for parsing pdf.
  34. ocr: using ocr technique to extract information from pdf.
  35. txt: suitable for the text-based pdf only and outperform ocr.
  36. auto: automatically choose the best method for parsing pdf from ocr and txt.
  37. without method specified, auto will be used by default.""",
  38. default="auto",
  39. )
  40. def cli(path, output_dir, method):
  41. model_config.__use_inside_model__ = True
  42. model_config.__model_mode__ = "full"
  43. if output_dir == "":
  44. if os.path.isdir(path):
  45. output_dir = os.path.join(path, "output")
  46. else:
  47. output_dir = os.path.join(os.path.dirname(path), "output")
  48. def read_fn(path):
  49. disk_rw = DiskReaderWriter(os.path.dirname(path))
  50. return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
  51. def parse_doc(doc_path: str):
  52. try:
  53. file_name = str(Path(doc_path).stem)
  54. pdf_data = read_fn(doc_path)
  55. do_parse(
  56. output_dir,
  57. file_name,
  58. pdf_data,
  59. [],
  60. method,
  61. )
  62. except Exception as e:
  63. logger.exception(e)
  64. if os.path.isdir(path):
  65. for doc_path in Path(path).glob("*.pdf"):
  66. parse_doc(doc_path)
  67. else:
  68. parse_doc(path)
  69. if __name__ == "__main__":
  70. cli()