7 months ago · b0e220c5f0
--- a/demo/batch_demo.py
+++ b/demo/batch_demo.py
@@ -1,38 +1,23 @@
 
															 import os
														
 
															-import shutil
														
 
															-import tempfile
														
 
															 from pathlib import Path
														
 
															-
														
 
															-import click
														
 
															-import fitz
														
 
															-from loguru import logger
														
 
															-
														
 
															-import magic_pdf.model as model_config
														
 
															 from magic_pdf.data.batch_build_dataset import batch_build_dataset
														
 
															-from magic_pdf.data.data_reader_writer import FileBasedDataReader
														
 
															-from magic_pdf.data.dataset import Dataset
														
 
															-from magic_pdf.libs.version import __version__
														
 
															-from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
														
 
															-from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
														
 
															+from magic_pdf.tools.common import batch_do_parse
														
 
															 def batch(pdf_dir, output_dir, method, lang):
														
 
															-    model_config.__use_inside_model__ = True
														
 
															-    model_config.__model_mode__ = 'full'
														
 
															     os.makedirs(output_dir, exist_ok=True)
														
 
															-
														
 
															     doc_paths = []
														
 
															     for doc_path in Path(pdf_dir).glob('*'):
														
 
															         if doc_path.suffix == '.pdf':
														
 
															             doc_paths.append(doc_path)
														
 
															     # build dataset with 2 workers
														
 
															-    datasets = batch_build_dataset(doc_paths, 2, lang)
														
 
															+    datasets = batch_build_dataset(doc_paths, 4, lang)
														
 
															-    os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "10" # every 10 pages will be parsed in one batch
														
 
															-    batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, True)
														
 
															+    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"  # every 200 pages will be parsed in one batch
														
 
															+    batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)
														
 
															 if __name__ == '__main__':
														
 
															-    batch("batch_data", "output", "ocr", "en")
														
 
															+    batch("pdfs", "output", "auto", "")
														
--- a/demo/demo.py
+++ b/demo/demo.py
@@ -7,18 +7,17 @@ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 
															 from magic_pdf.config.enums import SupportedPdfParseMethod
														
 
															 # args
														
 
															-pdf_file_name = "demo1.pdf"  # replace with the real pdf path
														
 
															-name_without_suff = pdf_file_name.split(".")[0]
														
 
															+__dir__ = os.path.dirname(os.path.abspath(__file__))
														
 
															+pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf")  # replace with the real pdf path
														
 
															+name_without_extension = os.path.basename(pdf_file_name).split('.')[0]
														
 
															 # prepare env
														
 
															-local_image_dir, local_md_dir = "output/images", "output"
														
 
															+local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images")
														
 
															+local_md_dir = os.path.join(__dir__, "output", name_without_extension)
														
 
															 image_dir = str(os.path.basename(local_image_dir))
														
 
															-
														
 
															 os.makedirs(local_image_dir, exist_ok=True)
														
 
															-image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
														
 
															-    local_md_dir
														
 
															-)
														
 
															+image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
														
 
															 # read bytes
														
 
															 reader1 = FileBasedDataReader("")
														
@@ -41,32 +40,29 @@ else:
 
															     ## pipeline
														
 
															     pipe_result = infer_result.pipe_txt_mode(image_writer)
														
 
															-### draw model result on each page
														
 
															-infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
														
 
															-
														
 
															 ### get model inference result
														
 
															 model_inference_result = infer_result.get_infer_res()
														
 
															 ### draw layout result on each page
														
 
															-pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
														
 
															+pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf"))
														
 
															 ### draw spans result on each page
														
 
															-pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
														
 
															+pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf"))
														
 
															 ### get markdown content
														
 
															 md_content = pipe_result.get_markdown(image_dir)
														
 
															 ### dump markdown
														
 
															-pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
														
 
															+pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir)
														
 
															 ### get content list content
														
 
															 content_list_content = pipe_result.get_content_list(image_dir)
														
 
															 ### dump content list
														
 
															-pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
														
 
															+pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir)
														
 
															 ### get middle json
														
 
															 middle_json_content = pipe_result.get_middle_json()
														
 
															 ### dump middle json
														
 
															-pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
														
 
															+pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json')
														
--- a/demo/demo1.pdf
+++ b/demo/demo1.pdf
--- a/demo/demo2.pdf
+++ b/demo/demo2.pdf
--- a/demo/batch_data/demo1.pdf
+++ b/demo/batch_data/demo1.pdf
--- a/demo/batch_data/demo2.pdf
+++ b/demo/batch_data/demo2.pdf
--- a/demo/pdfs/demo3.pdf
+++ b/demo/pdfs/demo3.pdf
--- a/demo/pdfs/small_ocr.pdf
+++ b/demo/pdfs/small_ocr.pdf
--- a/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
+++ b/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
@@ -92,7 +92,7 @@ You can find the `magic-pdf.json` file in your user directory.
 
															 Download a sample file from the repository and test it.
														
 
															 ```sh
														
 
															-wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
														
 
															+wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
														
 
															 magic-pdf -p small_ocr.pdf -o ./output
														
 
															 ```
														
--- a/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
+++ b/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
@@ -91,7 +91,7 @@ pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
 
															 从仓库中下载样本文件，并测试
														
 
															 ```bash
														
 
															-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/small_ocr.pdf
														
 
															+wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
														
 
															 magic-pdf -p small_ocr.pdf -o ./output
														
 
															 ```
														
--- a/docs/README_Windows_CUDA_Acceleration_en_US.md
+++ b/docs/README_Windows_CUDA_Acceleration_en_US.md
@@ -53,7 +53,7 @@ You can find the `magic-pdf.json` file in your 【user directory】 .
 
															 Download a sample file from the repository and test it.
														
 
															 ```powershell
														
 
															-  wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
														
 
															+  wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
														
 
															   magic-pdf -p small_ocr.pdf -o ./output
														
 
															 ```
														
--- a/docs/README_Windows_CUDA_Acceleration_zh_CN.md
+++ b/docs/README_Windows_CUDA_Acceleration_zh_CN.md
@@ -54,7 +54,7 @@ pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
 
															 从仓库中下载样本文件，并测试
														
 
															 ```powershell
														
 
															- wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
														
 
															+ wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
														
 
															  magic-pdf -p small_ocr.pdf -o ./output
														
 
															 ```
														
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
@@ -3,21 +3,6 @@ from __future__ import division
 
															 from __future__ import print_function
														
 
															 from __future__ import unicode_literals
														
 
															-import os
														
 
															-import sys
														
 
															-import numpy as np
														
 
															-# import paddle
														
 
															-import signal
														
 
															-import random
														
 
															-
														
 
															-__dir__ = os.path.dirname(os.path.abspath(__file__))
														
 
															-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
														
 
															-
														
 
															-
														
 
															-import copy
														
 
															-# from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler
														
 
															-# import paddle.distributed as dist
														
 
															-
														
 
															 from .imaug import transform, create_operators
														
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py
@@ -16,7 +16,6 @@ class TextSystem(object):
 
															         if self.use_angle_cls:
														
 
															             self.text_classifier = predict_cls.TextClassifier(args, **kwargs)
														
 
															-
														
 
															     def get_rotate_crop_image(self, img, points):
														
 
															         '''
														
 
															         img_height, img_width = img.shape[0:2]
														
--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
@@ -90,8 +90,6 @@ without method specified, auto will be used by default.""",
 
															     default=None,
														
 
															 )
														
 
															 def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
														
 
															-    model_config.__use_inside_model__ = True
														
 
															-    model_config.__model_mode__ = 'full'
														
 
															     os.makedirs(output_dir, exist_ok=True)
														
 
															     temp_dir = tempfile.mkdtemp()
														
 
															     def read_fn(path: Path):
														
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -73,7 +73,7 @@ def _do_parse(
 
															     pdf_bytes_or_dataset,
														
 
															     model_list,
														
 
															     parse_method,
														
 
															-    debug_able,
														
 
															+    debug_able=False,
														
 
															     f_draw_span_bbox=True,
														
 
															     f_draw_layout_bbox=True,
														
 
															     f_dump_md=True,
														
@@ -250,7 +250,7 @@ def do_parse(
 
															     pdf_bytes_or_dataset,
														
 
															     model_list,
														
 
															     parse_method,
														
 
															-    debug_able,
														
 
															+    debug_able=False,
														
 
															     f_draw_span_bbox=True,
														
 
															     f_draw_layout_bbox=True,
														
 
															     f_dump_md=True,
														
@@ -291,7 +291,7 @@ def batch_do_parse(
 
															     pdf_file_names: list[str],
														
 
															     pdf_bytes_or_datasets: list[bytes | Dataset],
														
 
															     parse_method,
														
 
															-    debug_able,
														
 
															+    debug_able=False,
														
 
															     f_draw_span_bbox=True,
														
 
															     f_draw_layout_bbox=True,
														
 
															     f_dump_md=True,