瀏覽代碼

refactor: update project configuration and dependencies in pyproject.toml and setup.py

myhloli 5 月之前
父節點
當前提交
53cd91033f
共有 3 個文件被更改,包括 170 次插入30 次删除
  1. 35 1
      mineru/backend/vlm/vlm_magic_model.py
  2. 100 4
      pyproject.toml
  3. 35 25
      setup.py

+ 35 - 1
mineru/backend/vlm/vlm_magic_model.py

@@ -200,7 +200,41 @@ def isolated_formula_clean(txt):
     latex = txt[:]
     if latex.startswith("\\["): latex = latex[2:]
     if latex.endswith("\\]"): latex = latex[:-2]
-    return latex.strip()
+    latex = latex_fix(latex.strip())
+    return latex
+
+
# Control-word delimiters accepted after \left / \right.  A control word only
# counts when it is not immediately followed by another letter (so that
# "\vertfoo" is not mistaken for "\vert").
_DELIM_CONTROL_WORDS = (
    'lceil', 'rceil', 'lfloor', 'rfloor', 'backslash',
    'uparrow', 'downarrow', 'Uparrow', 'Downarrow',
    'updownarrow', 'Updownarrow', 'langle', 'rangle',
    'vert', 'Vert', 'lvert', 'rvert', 'lVert', 'rVert',
    'lbrace', 'rbrace', 'lbrack', 'rbrack',
)

# Single-character / control-symbol delimiters accepted after \left / \right.
# '{', '}' and '\.' are kept from the historical whitelist so inputs that were
# left untouched before are still left untouched; '.' is the null delimiter.
_DELIM_SYMBOLS = (
    '\\{', '\\}', '\\|', '\\.',
    '(', ')', '[', ']', '{', '}', '/', '|', '<', '>', '.',
)

_VALID_DELIM_RE = (
    r'\\(?:' + '|'.join(_DELIM_CONTROL_WORDS) + r')(?![a-zA-Z])|'
    + '|'.join(map(re.escape, _DELIM_SYMBOLS))
)

# A \left or \right command (not \leftarrow, \rightarrow, \lefteqn, ...) that
# is NOT followed, after optional whitespace, by a whitelisted delimiter.
_MISSING_DELIM_PATTERN = re.compile(
    r'(\\(?:left|right))(?![a-zA-Z])(?!\s*(?:' + _VALID_DELIM_RE + r'))'
)
_LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
_RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
# Strips the command itself and, if present, its null delimiter; any real
# delimiter character that followed stays in the text.
_LEFT_RIGHT_REMOVE_PATTERN = re.compile(
    r'\\(?:left|right)(?![a-zA-Z])(?:\s*\.)?'
)


def latex_fix(latex):
    """Repair malformed ``\\left`` / ``\\right`` usage in a LaTeX string.

    Two repairs are applied, in order:

    1. Every ``\\left`` / ``\\right`` that is not followed by a whitelisted
       delimiter gets the null delimiter ``.`` inserted directly after the
       command.  The text that followed the command is preserved.
    2. If the number of ``\\left`` and ``\\right`` commands still differs,
       all of them (together with a null delimiter, if any) are removed so
       the remaining expression has no unbalanced pair.

    Longer commands that merely start with ``\\left`` or ``\\right``
    (``\\leftarrow``, ``\\rightarrow``, ``\\lefteqn`` ...) are never touched.

    Args:
        latex: The LaTeX source to repair.

    Returns:
        The repaired LaTeX string; unchanged if nothing needed fixing.
    """
    # Insert "." after \left / \right commands lacking a valid delimiter.
    latex = _MISSING_DELIM_PATTERN.sub(r'\1.', latex)

    # The counting patterns skip \lefteqn, \leftarrow, \rightarrow, etc.
    left_count = len(_LEFT_COUNT_PATTERN.findall(latex))
    right_count = len(_RIGHT_COUNT_PATTERN.findall(latex))

    if left_count != right_count:
        # Unbalanced pairs cannot be rendered; drop the sizing commands.
        return _LEFT_RIGHT_REMOVE_PATTERN.sub('', latex)
    return latex
 
 
 def __reduct_overlap(bboxes):

+ 100 - 4
pyproject.toml

@@ -1,7 +1,103 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
 
+[project]
+name = "mineru"
+dynamic = ["version"]
+license = {text = "AGPL-3.0"}
+description = "A practical tool for converting PDF to Markdown"
+readme = "README.md"
+requires-python = ">=3.10,<3.14"
+keywords = ["magic-pdf", "mineru", "MinerU", "convert", "pdf", "markdown"]
+classifiers = [
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+    "boto3>=1.28.43",
+    "click>=8.1.7",
+    "loguru>=0.7.2",
+    "numpy>=1.21.6",
+    "pdfminer.six==20250506",
+    "tqdm>=4.67.1",
+    "requests",
+    "httpx",
+    "pillow>=11.0.0",
+    "pypdfium2>=4.30.0",
+    "pypdf>=5.6.0",
+    "reportlab",
+    "pdftext>=0.6.2"
+]
 
-[tool.black]
-line-length = 128
+[project.optional-dependencies]
+vlm = [
+    "transformers>=4.51.1",
+    "torch>=2.6.0",
+    "accelerate>=1.5.1",
+    "pydantic>=2.7.2,<2.11",
+]
+sglang = [
+    "sglang[all]==0.4.6.post5",
+]
+pipeline = [
+    "matplotlib>=3.10,<4",
+    "ultralytics>=8.3.48,<9",
+    "doclayout_yolo==0.0.4",
+    "dill>=0.3.8,<1",
+    "rapid_table>=1.0.5,<2.0.0",
+    "PyYAML>=6.0.2,<7",
+    "ftfy>=6.3.1,<7",
+    "openai>=1.70.0,<2",
+    "shapely>=2.0.7,<3",
+    "pyclipper>=1.3.0,<2",
+    "omegaconf>=2.3.0,<3",
+    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
+    "torchvision",
+    "transformers>=4.49.0,!=4.51.0,<5.0.0",
+    "fast-langdetect>=0.2.3,<0.3.0",
+]
+pipeline_old_linux = [
+    "matplotlib>=3.10,<=3.10.1",
+    "ultralytics>=8.3.48,<=8.3.104",
+    "doclayout_yolo==0.0.4",
+    "dill==0.3.8",
+    "PyYAML==6.0.2",
+    "ftfy==6.3.1",
+    "openai==1.71.0",
+    "shapely==2.1.0",
+    "pyclipper==1.3.0.post6",
+    "omegaconf==2.3.0",
+    "albumentations==1.4.20",
+    "rapid_table==1.0.3",
+    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
+    "torchvision",
+    "transformers>=4.49.0,!=4.51.0,<5.0.0",
+    "fast-langdetect>=0.2.3,<0.3.0",
+]
 
-[tool.ruff]
-line-length = 128
+[project.urls]
+Home = "https://mineru.net/"
+Repository = "https://github.com/opendatalab/MinerU"
+
+[project.scripts]
+mineru = "mineru.cli:client.main"
+mineru-sglang-server = "mineru.cli.vlm-sglang_server:main"
+mineru-models-download = "mineru.cli.models_download:download_models"
+
+[tool.setuptools.dynamic]
+version = {attr = "mineru.version.__version__"}
+
+[tool.setuptools.packages.find]
+include = ["mineru*"]
+namespaces = false
+
+[tool.setuptools.package-data]
+"mineru.resources" = ["**"]
+"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources" = ["**"]
+
+[tool.setuptools]
+include-package-data = true
+zip-safe = false

+ 35 - 25
setup.py

@@ -3,22 +3,6 @@ from setuptools import setup, find_packages
 from mineru.version import __version__
 
 
-def parse_requirements(filename):
-    with open(filename) as f:
-        lines = f.read().splitlines()
-
-    requires = []
-
-    for line in lines:
-        if "http" in line:
-            pkg_name_without_url = line.split('@')[0].strip()
-            requires.append(pkg_name_without_url)
-        else:
-            requires.append(line)
-
-    return requires
-
-
 if __name__ == '__main__':
     with Path(Path(__file__).parent,
               'README.md').open(encoding='utf-8') as file:
@@ -32,17 +16,35 @@ if __name__ == '__main__':
             "mineru.resources": ["**"],  # 包含magic_pdf.resources目录下的所有文件
             "mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # pytorchocr.resources目录下的所有文件
         },
-        install_requires=parse_requirements('requirements.txt'),  # 项目依赖的第三方库
+        install_requires=[
+                    "boto3>=1.28.43",
+                    "click>=8.1.7",
+                    "loguru>=0.6.0",
+                    "numpy>=1.21.6",
+                    "pdfminer.six==20250506",
+                    "tqdm>=4.67.1",
+                    "requests",
+                    "httpx",
+                    "pillow",
+                    "pypdfium2",
+                    "loguru",
+                    "pypdf",
+                    "reportlab",
+        ],  # 项目依赖的第三方库
         extras_require={
-            "lite": [
-                    "paddleocr==2.7.3",
-                    "paddlepaddle==3.0.0b1;platform_system=='Linux'",
-                    "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
+            "vlm":[
+                "transformers>=4.51.1",
+                "torch>=2.6.0",
+                "accelerate>=1.5.1",
+                "pydantic>=2.7.2,<2.11",
+            ],
+            "sglang": [
+                "sglang[all]==0.4.6.post5",
             ],
-            "full": [
+            "pipeline": [
                      "matplotlib>=3.10,<4",
                      "ultralytics>=8.3.48,<9",  # yolov8,公式检测
-                     "doclayout_yolo==0.0.2b1",  # doclayout_yolo
+                     "doclayout_yolo==0.0.4",  # doclayout_yolo
                      "dill>=0.3.8,<1",  # doclayout_yolo
                      "rapid_table>=1.0.5,<2.0.0",  # rapid_table
                      "PyYAML>=6.0.2,<7",  # yaml
@@ -51,11 +53,15 @@ if __name__ == '__main__':
                      "shapely>=2.0.7,<3",  # imgaug-paddleocr2pytorch
                      "pyclipper>=1.3.0,<2",  # paddleocr2pytorch
                      "omegaconf>=2.3.0,<3",  # paddleocr2pytorch
+                    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
+                    "torchvision",
+                    "transformers>=4.49.0,!=4.51.0,<5.0.0",
+                    "fast-langdetect>=0.2.3,<0.3.0",
             ],
-            "full_old_linux": [
+            "pipeline_old_linux": [
                     "matplotlib>=3.10,<=3.10.1",
                     "ultralytics>=8.3.48,<=8.3.104",  # yolov8,公式检测
-                    "doclayout_yolo==0.0.2b1",  # doclayout_yolo
+                    "doclayout_yolo==0.0.4",  # doclayout_yolo
                     "dill==0.3.8",  # doclayout_yolo
                     "PyYAML==6.0.2",  # yaml
                     "ftfy==6.3.1",  # unimernet_hf
@@ -65,6 +71,10 @@ if __name__ == '__main__':
                     "omegaconf==2.3.0",  # paddleocr2pytorch
                     "albumentations==1.4.20", # 1.4.21引入的simsimd不支持2019年及更早的linux系统
                     "rapid_table==1.0.3",  # rapid_table新版本依赖的onnxruntime不支持2019年及更早的linux系统
+                    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
+                    "torchvision",
+                    "transformers>=4.49.0,!=4.51.0,<5.0.0",
+                    "fast-langdetect>=0.2.3,<0.3.0",
             ],
         },
         description="A practical tool for converting PDF to Markdown",  # 简短描述