cli.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import os
  2. import click
  3. from loguru import logger
  4. from pathlib import Path
  5. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  6. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  7. import magic_pdf.model as model_config
  8. from magic_pdf.tools.common import parse_pdf_methods, do_parse
  # Command-line entry point: hand one PDF file, or every ``*.pdf`` found
  # directly inside a directory, to ``do_parse`` from magic_pdf.tools.common.
  #
  # NOTE: this command is documented with comments rather than a docstring on
  # purpose -- click displays a command function's docstring in ``--help``,
  # so adding one would change what the program prints.
  @click.command()
  @click.option(
      "-p",
      "--path",
      "path",
      type=click.Path(exists=True),
      required=True,
      help="local pdf filepath or directory",
  )
  @click.option(
      "-o",
      "--output-dir",
      "output_dir",
      type=str,
      help="output local directory",
      default="",
  )
  @click.option(
      "-m",
      "--method",
      "method",
      type=parse_pdf_methods,
      help=(
          "the method for parsing pdf.\n"
          "ocr: using ocr technique to extract information from pdf.\n"
          "txt: suitable for the text-based pdf only and outperform ocr.\n"
          "auto: automatically choose the best method for parsing pdf from ocr and txt"
      ),
      default="auto",
  )
  def cli(path, output_dir, method):
      # Module-level switches on magic_pdf.model; they are set before any
      # document is parsed.  NOTE(review): their exact effect is defined in
      # magic_pdf.model, not in this file.
      model_config.__use_inside_model__ = True
      model_config.__model_mode__ = "full"

      # No --output-dir given: write to an "output" folder located inside the
      # input directory, or next to the input file.
      if output_dir == "":
          base_dir = path if os.path.isdir(path) else os.path.dirname(path)
          output_dir = os.path.join(base_dir, "output")

      def load_pdf(pdf_path):
          """Read ``pdf_path`` through a DiskReaderWriter rooted at its folder."""
          folder, leaf = os.path.split(pdf_path)
          # MODE_BIN presumably yields the raw file content -- confirm in
          # AbsReaderWriter.
          return DiskReaderWriter(folder).read(leaf, AbsReaderWriter.MODE_BIN)

      def convert_one(doc_path: str):
          """Parse a single document; any failure is logged, never raised."""
          try:
              stem = str(Path(doc_path).stem)
              content = load_pdf(doc_path)
              # The fourth positional argument is an empty list; its meaning is
              # defined by do_parse and is not visible from this file.
              do_parse(output_dir, stem, content, [], method)
          except Exception as err:
              # Best effort: one bad document must not stop the rest of a
              # directory run.
              logger.exception(err)

      # A directory expands to its top-level *.pdf entries (no recursion);
      # anything else is treated as a single document.
      targets = Path(path).glob("*.pdf") if os.path.isdir(path) else [path]
      for target in targets:
          convert_one(target)
  # Run the click command only when this file is executed as a script,
  # not when it is imported as a module.
  if __name__ == "__main__":
      cli()