5 bulan lalu · 6833882585
--- a/mineru/backend/pipeline/pipeline_analyze.py
+++ b/mineru/backend/pipeline/pipeline_analyze.py
@@ -145,7 +145,7 @@ def doc_analyze(
 
				             f'Batch {index + 1}/{len(batch_images)}: '
			
 
				             f'{processed_images_count} pages/{len(images_with_extra_info)} pages'
			
 
				         )
			
 
				-        batch_results = may_batch_image_analyze(batch_image, formula_enable, table_enable)
			
 
				+        batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
			
 
				         results.extend(batch_results)
			
 
				 
			
 
				     # 构建返回结果
			
@@ -171,7 +171,7 @@ def doc_analyze(
 
				     return middle_json_list, infer_results
			
 
				 
			
 
				 
			
 
				-def may_batch_image_analyze(
			
 
				+def batch_image_analyze(
			
 
				         images_with_extra_info: list[(np.ndarray, bool, str)],
			
 
				         formula_enable=None,
			
 
				         table_enable=None):
			
@@ -192,7 +192,7 @@ def may_batch_image_analyze(
 
				     if str(device).startswith('npu') or str(device).startswith('cuda'):
			
 
				         vram = get_vram(device)
			
 
				         if vram is not None:
			
 
				-            gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram)))
			
 
				+            gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
			
 
				             if gpu_memory >= 16:
			
 
				                 batch_ratio = 16
			
 
				             elif gpu_memory >= 12:
			
--- a/mineru/cli/client.py
+++ b/mineru/cli/client.py
@@ -42,6 +42,17 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
 
				     default='pipeline',
			
 
				 )
			
 
				 @click.option(
			
 
				+    '-l',
			
 
				+    '--lang',
			
 
				+    'lang',
			
 
				+    type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']),
			
 
				+    help="""
			
 
				+    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
			
 
				+    Without languages specified, 'ch' will be used by default.
			
 
				+    """,
			
 
				+    default='ch',
			
 
				+)
			
 
				+@click.option(
			
 
				     '-u',
			
 
				     '--url',
			
 
				     'server_url',
			
@@ -68,24 +79,33 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
 
				     default=None,
			
 
				 )
			
 
				 
			
 
				-def main(input_path, output_dir, backend, server_url, start_page_id, end_page_id):
			
 
				+def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_page_id):
			
 
				     os.makedirs(output_dir, exist_ok=True)
			
 
				 
			
 
				-    def parse_doc(path: Path):
			
 
				+    def parse_doc(path_list: list[Path]):
			
 
				         try:
			
 
				-            file_name = str(Path(path).stem)
			
 
				-            pdf_bits = read_fn(path)
			
 
				-            do_parse(output_dir, file_name, pdf_bits, backend, server_url,
			
 
				-                     start_page_id=start_page_id, end_page_id=end_page_id)
			
 
				+            file_name_list = []
			
 
				+            pdf_bytes_list = []
			
 
				+            lang_list = []
			
 
				+            for path in path_list:
			
 
				+                file_name = str(Path(path).stem)
			
 
				+                pdf_bytes = read_fn(path)
			
 
				+                file_name_list.append(file_name)
			
 
				+                pdf_bytes_list.append(pdf_bytes)
			
 
				+                lang_list.append(lang)
			
 
				+            do_parse(output_dir, file_name_list, pdf_bytes_list, lang_list, backend, server_url,
			
 
				+                         start_page_id=start_page_id, end_page_id=end_page_id)
			
 
				         except Exception as e:
			
 
				             logger.exception(e)
			
 
				 
			
 
				     if os.path.isdir(input_path):
			
 
				+        doc_path_list = []
			
 
				         for doc_path in Path(input_path).glob('*'):
			
 
				             if doc_path.suffix in pdf_suffixes + image_suffixes:
			
 
				-                parse_doc(Path(doc_path))
			
 
				+                doc_path_list.append(doc_path)
			
 
				+        parse_doc(doc_path_list)
			
 
				     else:
			
 
				-        parse_doc(Path(input_path))
			
 
				+        parse_doc([Path(input_path)])
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     main()
			
--- a/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
@@ -9,7 +9,7 @@ import numpy as np
 
				 import yaml
			
 
				 from loguru import logger
			
 
				 
			
 
				-from magic_pdf.libs.config_reader import get_device, get_local_models_dir
			
 
				+from mineru.backend.pipeline.config_reader import get_device, get_local_models_dir
			
 
				 from ....utils.ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image
			
 
				 from .tools.infer.predict_system import TextSystem
			
 
				 from .tools.infer import pytorchocr_utility as utility
			
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 
				 from pathlib import Path
			
 
				 from setuptools import setup, find_packages
			
 
				-from magic_pdf.libs.version import __version__
			
 
				+from mineru.version import __version__
			
 
				 
			
 
				 
			
 
				 def parse_requirements(filename):
			
@@ -24,13 +24,13 @@ if __name__ == '__main__':
 
				               'README.md').open(encoding='utf-8') as file:
			
 
				         long_description = file.read()
			
 
				     setup(
			
 
				-        name="magic_pdf",  # 项目名
			
 
				+        name="mineru",  # 项目名
			
 
				         version=__version__,  # 自动从tag中获取版本号
			
 
				         license="AGPL-3.0",
			
 
				-        packages=find_packages() + ["magic_pdf.resources"] + ["magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],  # 包含所有的包
			
 
				+        packages=find_packages() + ["mineru.resources"] + ["mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],  # 包含所有的包
			
 
				         package_data={
			
 
				-            "magic_pdf.resources": ["**"],  # 包含magic_pdf.resources目录下的所有文件
			
 
				-            "magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # pytorchocr.resources目录下的所有文件
			
 
				+            "mineru.resources": ["**"],  # 包含magic_pdf.resources目录下的所有文件
			
 
				+            "mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # pytorchocr.resources目录下的所有文件
			
 
				         },
			
 
				         install_requires=parse_requirements('requirements.txt'),  # 项目依赖的第三方库
			
 
				         extras_require={
			
@@ -84,8 +84,7 @@ if __name__ == '__main__':
 
				         python_requires=">=3.10,<3.14",  # 项目依赖的 Python 版本
			
 
				         entry_points={
			
 
				             "console_scripts": [
			
 
				-                "magic-pdf = magic_pdf.tools.cli:cli",
			
 
				-                "magic-pdf-dev = magic_pdf.tools.cli_dev:cli" 
			
 
				+                "mineru = mineru.cli:client.main",  # 命令行入口点，mineru命令将调用mineru.cli.client.main函数
			
 
				             ],
			
 
				         },  # 项目提供的可执行命令
			
 
				         include_package_data=True,  # 是否包含非代码文件，如数据文件、配置文件等