# batch_demo.py (750 B)
import os
from pathlib import Path

from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.tools.common import batch_do_parse
  5. def batch(pdf_dir, output_dir, method, lang):
  6. os.makedirs(output_dir, exist_ok=True)
  7. doc_paths = []
  8. for doc_path in Path(pdf_dir).glob('*'):
  9. if doc_path.suffix == '.pdf':
  10. doc_paths.append(doc_path)
  11. # build dataset with 2 workers
  12. datasets = batch_build_dataset(doc_paths, 4, lang)
  13. # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200" # every 200 pages will be parsed in one batch
  14. batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)
  15. if __name__ == '__main__':
  16. batch("pdfs", "output", "auto", "")