Browse Source

refactor(magic_pdf): optimize environment setup and dependencies

- Add environment variables to disable the albumentations update check and YOLO verbose logging
- Import torchtext and disable deprecation warnings
- Update unimernet to 0.2.2
- Specify ultralytics version as >=8.3.48
- Remove upper version limit for torch
myhloli 11 months ago
parent
commit
a296ea41f9
4 changed files with 17 additions and 6 deletions
  1. 12 1
      magic_pdf/model/doc_analyze_by_custom_model.py
  2. 2 2
      requirements-docker.txt
  3. 1 1
      requirements.txt
  4. 2 2
      setup.py

+ 12 - 1
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -1,10 +1,21 @@
-
+import os
 import time
 
 import fitz
 import numpy as np
 from loguru import logger
 
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations from checking for updates
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+
+try:
+    import torchtext
+    # Feature-detect the helper: comparing version strings misorders e.g. '0.9.0' >= '0.18.0'.
+    if hasattr(torchtext, 'disable_torchtext_deprecation_warning'):
+        torchtext.disable_torchtext_deprecation_warning()
+except ImportError:
+    pass
+
 import magic_pdf.model as model_config
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.clean_memory import clean_memory

+ 2 - 2
requirements-docker.txt

@@ -7,9 +7,9 @@ numpy>=1.21.6,<2.0.0
 fast-langdetect==0.2.0
 scikit-learn>=1.0.2
 pdfminer.six==20231228
-unimernet==0.2.1
+unimernet==0.2.2
 matplotlib
-ultralytics
+ultralytics>=8.3.48
 paddleocr==2.7.3
 paddlepaddle==3.0.0b1
 struct-eqtable==0.3.2

+ 1 - 1
requirements.txt

@@ -7,7 +7,7 @@ numpy>=1.21.6,<2.0.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 scikit-learn>=1.0.2
-torch>=2.2.2,<=2.3.1
+torch>=2.2.2
 transformers
 # pdfminer.six==20231228
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.

+ 2 - 2
setup.py

@@ -36,10 +36,10 @@ if __name__ == '__main__':
                      "paddlepaddle==3.0.0b1;platform_system=='Linux'",
                      "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
                      ],
-            "full": ["unimernet==0.2.1",  # unimernet升级0.2.1
+            "full": ["unimernet==0.2.2",  # unimernet升级0.2.2
                      "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1及之后不提供windows的预编译包,避免一些没有编译环境的windows设备安装失败
                      "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # linux 和 macos 不应限制matplotlib的最高版本,以避免无法更新导致的一些bug
-                     "ultralytics>=8.3.47",  # yolov8,公式检测
+                     "ultralytics>=8.3.48",  # yolov8,公式检测
                      "paddleocr==2.7.3",  # 2.8.0及2.8.1版本与detectron2有冲突,需锁定2.7.3
                      "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # 解决linux的段异常问题
                      "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降,需锁定2.6.1