| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- import os
- import click
- from loguru import logger
- from pathlib import Path
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
- import magic_pdf.model as model_config
- from magic_pdf.tools.common import parse_pdf_methods, do_parse
@click.command()
@click.option(
    "-p",
    "--path",
    "path",
    type=click.Path(exists=True),
    required=True,
    help="local pdf filepath or directory",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="output local directory",
    default="",
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt""",
    default="auto",
)
def cli(path, output_dir, method):
    # Command entry point: parse one PDF file, or every "*.pdf" directly
    # inside a directory, and hand each document to ``do_parse``.
    #
    #   path       -- existing file or directory (validated by click.Path).
    #   output_dir -- destination directory; "" means "<input dir>/output".
    #   method     -- parsing method, converted by ``parse_pdf_methods``.
    #
    # NOTE: documented with comments rather than a docstring on purpose --
    # click shows a command function's docstring as its --help text, so
    # adding one would change the user-visible output.

    # Module-level switches on magic_pdf.model, set before any parsing.
    # NOTE(review): presumably selects the bundled model in "full" mode --
    # confirm against magic_pdf.model.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = "full"

    # No explicit output directory: place "output" next to the input
    # (inside the directory itself when a directory was given).
    if output_dir == "":
        base_dir = path if os.path.isdir(path) else os.path.dirname(path)
        output_dir = os.path.join(base_dir, "output")

    def load_pdf(pdf_path):
        """Read *pdf_path* in binary mode via a DiskReaderWriter on its parent directory."""
        return DiskReaderWriter(os.path.dirname(pdf_path)).read(
            os.path.basename(pdf_path), AbsReaderWriter.MODE_BIN
        )

    def parse_doc(doc_path: str):
        """Parse a single document; failures are logged and not re-raised."""
        try:
            stem = Path(doc_path).stem
            content = load_pdf(doc_path)
            do_parse(output_dir, stem, content, [], method)
        except Exception as err:
            # Best-effort per document: one bad PDF must not abort the batch.
            logger.exception(err)

    # Single file: parse it and finish.
    if not os.path.isdir(path):
        parse_doc(path)
        return

    # Directory: only PDFs directly inside it are processed (non-recursive).
    for pdf_file in Path(path).glob("*.pdf"):
        parse_doc(pdf_file)
if __name__ == "__main__":
    # Executed as a script: let click parse sys.argv and run the command.
    cli()
|