batch_demo.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import os
  2. import shutil
  3. import tempfile
  4. from pathlib import Path
  5. import click
  6. import fitz
  7. from loguru import logger
  8. import magic_pdf.model as model_config
  9. from magic_pdf.data.batch_build_dataset import batch_build_dataset
  10. from magic_pdf.data.data_reader_writer import FileBasedDataReader
  11. from magic_pdf.data.dataset import Dataset
  12. from magic_pdf.libs.version import __version__
  13. from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
  14. from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
  15. def batch(pdf_dir, output_dir, method, lang):
  16. model_config.__use_inside_model__ = True
  17. model_config.__model_mode__ = 'full'
  18. os.makedirs(output_dir, exist_ok=True)
  19. doc_paths = []
  20. for doc_path in Path(pdf_dir).glob('*'):
  21. if doc_path.suffix == '.pdf':
  22. doc_paths.append(doc_path)
  23. # build dataset with 2 workers
  24. datasets = batch_build_dataset(doc_paths, 2, lang)
  25. os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "10" # every 10 pages will be parsed in one batch
  26. batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, True)
  27. if __name__ == '__main__':
  28. batch("batch_data", "output", "ocr", "en")