| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- import os
- import shutil
- import tempfile
- from pathlib import Path
- import click
- import fitz
- from loguru import logger
- import magic_pdf.model as model_config
- from magic_pdf.data.batch_build_dataset import batch_build_dataset
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
- from magic_pdf.data.dataset import Dataset
- from magic_pdf.libs.version import __version__
- from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
- from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
- pdf_suffixes = ['.pdf']
- ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
- image_suffixes = ['.png', '.jpeg', '.jpg']
- @click.command()
- @click.version_option(__version__,
- '--version',
- '-v',
- help='display the version and exit')
- @click.option(
- '-p',
- '--path',
- 'path',
- type=click.Path(exists=True),
- required=True,
- help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
- )
- @click.option(
- '-o',
- '--output-dir',
- 'output_dir',
- type=click.Path(),
- required=True,
- help='output local directory',
- )
- @click.option(
- '-m',
- '--method',
- 'method',
- type=parse_pdf_methods,
- help="""the method for parsing pdf.
- ocr: using ocr technique to extract information from pdf.
- txt: suitable for the text-based pdf only and outperform ocr.
- auto: automatically choose the best method for parsing pdf from ocr and txt.
- without method specified, auto will be used by default.""",
- default='auto',
- )
- @click.option(
- '-l',
- '--lang',
- 'lang',
- type=str,
- help="""
- Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
- You should input "Abbreviation" with language form url:
- https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
- """,
- default=None,
- )
- @click.option(
- '-d',
- '--debug',
- 'debug_able',
- type=bool,
- help='Enables detailed debugging information during the execution of the CLI commands.',
- default=False,
- )
- @click.option(
- '-s',
- '--start',
- 'start_page_id',
- type=int,
- help='The starting page for PDF parsing, beginning from 0.',
- default=0,
- )
- @click.option(
- '-e',
- '--end',
- 'end_page_id',
- type=int,
- help='The ending page for PDF parsing, beginning from 0.',
- default=None,
- )
- def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
- os.makedirs(output_dir, exist_ok=True)
- temp_dir = tempfile.mkdtemp()
- def read_fn(path: Path):
- if path.suffix in ms_office_suffixes:
- convert_file_to_pdf(str(path), temp_dir)
- fn = os.path.join(temp_dir, f'{path.stem}.pdf')
- elif path.suffix in image_suffixes:
- with open(str(path), 'rb') as f:
- bits = f.read()
- pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
- fn = os.path.join(temp_dir, f'{path.stem}.pdf')
- with open(fn, 'wb') as f:
- f.write(pdf_bytes)
- elif path.suffix in pdf_suffixes:
- fn = str(path)
- else:
- raise Exception(f'Unknown file suffix: {path.suffix}')
- disk_rw = FileBasedDataReader(os.path.dirname(fn))
- return disk_rw.read(os.path.basename(fn))
- def parse_doc(doc_path: Path, dataset: Dataset | None = None):
- try:
- file_name = str(Path(doc_path).stem)
- if dataset is None:
- pdf_data_or_dataset = read_fn(doc_path)
- else:
- pdf_data_or_dataset = dataset
- do_parse(
- output_dir,
- file_name,
- pdf_data_or_dataset,
- [],
- method,
- debug_able,
- start_page_id=start_page_id,
- end_page_id=end_page_id,
- lang=lang
- )
- except Exception as e:
- logger.exception(e)
- if os.path.isdir(path):
- doc_paths = []
- for doc_path in Path(path).glob('*'):
- if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
- if doc_path.suffix in ms_office_suffixes:
- convert_file_to_pdf(str(doc_path), temp_dir)
- doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
- elif doc_path.suffix in image_suffixes:
- with open(str(doc_path), 'rb') as f:
- bits = f.read()
- pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
- fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
- with open(fn, 'wb') as f:
- f.write(pdf_bytes)
- doc_path = Path(fn)
- doc_paths.append(doc_path)
- datasets = batch_build_dataset(doc_paths, 4, lang)
- batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
- else:
- parse_doc(Path(path))
- shutil.rmtree(temp_dir)
- if __name__ == '__main__':
- cli()
|