Selaa lähdekoodia

feat: add batch example

icecraft 7 kuukautta sitten
vanhempi
commit
d91367159f
3 muutettua tiedostoa jossa 38 lisäystä ja 0 poistoa
  1. BIN
      demo/batch_data/demo1.pdf
  2. BIN
      demo/batch_data/demo2.pdf
  3. 38 0
      demo/batch_demo.py

BIN
demo/batch_data/demo1.pdf


BIN
demo/batch_data/demo2.pdf


+ 38 - 0
demo/batch_demo.py

@@ -0,0 +1,38 @@
+import os
+import shutil
+import tempfile
+from pathlib import Path
+
+import click
+import fitz
+from loguru import logger
+
+import magic_pdf.model as model_config
+from magic_pdf.data.batch_build_dataset import batch_build_dataset
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.libs.version import __version__
+from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+
+def batch(pdf_dir, output_dir, method, lang):
+    """Parse every PDF located directly inside ``pdf_dir`` in one batched run.
+
+    Args:
+        pdf_dir: Directory scanned (non-recursively) for PDF files.
+        output_dir: Directory passed to ``batch_do_parse``; created if missing.
+        method: Parse method forwarded to ``batch_do_parse`` (the demo uses ``'ocr'``).
+        lang: Language forwarded to ``batch_build_dataset`` (the demo uses ``'en'``).
+    """
+    model_config.__use_inside_model__ = True
+    model_config.__model_mode__ = 'full'
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Sort so the processing order does not depend on the file system's
+    # listing order, and compare the suffix case-insensitively so that files
+    # such as ``scan.PDF`` are picked up as well. Directories are skipped.
+    doc_paths = sorted(
+        doc_path
+        for doc_path in Path(pdf_dir).glob('*')
+        if doc_path.is_file() and doc_path.suffix.lower() == '.pdf'
+    )
+    if not doc_paths:
+        # Nothing to parse: skip the dataset build and the parse step.
+        logger.warning(f'no PDF files found in {pdf_dir}, nothing to parse')
+        return
+
+    # build dataset with 2 workers
+    datasets = batch_build_dataset(doc_paths, 2, lang)
+
+    # every 10 pages will be parsed in one batch
+    os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "10"
+
+    # The name list and ``datasets`` are both derived from ``doc_paths``, so
+    # they stay aligned index by index.
+    # NOTE(review): the trailing positional ``True`` presumably enables debug
+    # output -- confirm against the signature of batch_do_parse.
+    doc_names = [doc_path.stem for doc_path in doc_paths]
+    batch_do_parse(output_dir, doc_names, datasets, method, True)
+
+
+if __name__ == '__main__':
+    # Demo entry point: scan the relative directory "batch_data" for PDFs and
+    # pass "output" as the output directory, using the OCR method with
+    # English as the language.
+    batch(pdf_dir="batch_data", output_dir="output", method="ocr", lang="en")
+