Przeglądaj źródła

build(docker): add Dockerfiles for global and Huawei NPU setups

- Add Dockerfile for global setup with Ubuntu base image
- Add Dockerfile for Huawei NPU setup with Ascend base image
- Update requirements file structure:
  - Rename requirements-docker.txt to docker/china/requirements.txt
  - Add new requirements files for global and Huawei NPU setups
- Install necessary packages and dependencies in both Dockerfiles
- Set up virtual environment and install Python packages
- Download models and configure magic-pdf for both setups
myhloli 10 miesięcy temu
rodzic
commit
ad09980807

+ 2 - 2
Dockerfile → docker/china/Dockerfile

@@ -30,8 +30,8 @@ RUN python3 -m venv /opt/mineru_venv
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip && \
-    wget https://gitee.com/myhloli/MinerU/raw/master/requirements-docker.txt && \
-    pip3 install -r requirements-docker.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/docker/china/requirements.txt && \
+    pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
     pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
 # Copy the configuration file template and install magic-pdf latest

+ 25 - 0
docker/china/requirements.txt

@@ -0,0 +1,25 @@
+# Python dependencies installed into the MinerU virtual environment by docker/china/Dockerfile
+boto3>=1.28.43
+Brotli>=1.1.0
+click>=8.1.7
+PyMuPDF>=1.24.9
+loguru>=0.6.0
+numpy>=1.21.6,<2.0.0
+fast-langdetect==0.2.0
+scikit-learn>=1.0.2
+pdfminer.six==20231228
+unimernet==0.2.3
+torch>=2.2.2,<=2.3.1
+torchvision>=0.17.2,<=0.18.1
+matplotlib
+ultralytics>=8.3.48
+paddleocr==2.7.3
+struct-eqtable==0.3.2
+einops
+accelerate
+doclayout_yolo==0.0.2
+rapidocr-paddle
+rapidocr-onnxruntime
+rapid_table
+openai
+detectron2

+ 50 - 0
docker/global/Dockerfile

@@ -0,0 +1,50 @@
+# Use the official Ubuntu base image
+FROM ubuntu:22.04
+
+# Suppress interactive prompts during package installation. This is declared as a
+# build-time ARG instead of ENV so the setting is visible to the RUN steps below
+# but is not baked into the environment of containers started from the image.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install the OS-level dependencies in a single layer:
+#   1. software-properties-common is installed first because it provides
+#      add-apt-repository;
+#   2. the deadsnakes PPA is registered (-y answers the confirmation prompt so the
+#      build can never block waiting for input);
+#   3. Python 3.10 (with venv and distutils), pip, wget, git and the libgl1 /
+#      libglib2.0-0 shared libraries are installed;
+#   4. the apt package lists are removed in the same layer to keep it small.
+RUN apt-get update && \
+    apt-get install -y \
+        software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y \
+        python3.10 \
+        python3.10-venv \
+        python3.10-distutils \
+        python3-pip \
+        wget \
+        git \
+        libgl1 \
+        libglib2.0-0 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Register python3.10 as the `python3` alternative, then use that interpreter to
+# create the MinerU virtual environment at /opt/mineru_venv
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+    && python3 -m venv /opt/mineru_venv
+
+# Inside the virtual environment: upgrade pip, download the requirements list for
+# the global image and install it (wheels.myhloli.com is consulted as an extra
+# package index), then install paddlepaddle-gpu from the PaddlePaddle cu118 index.
+# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir --upgrade pip && \
+    wget https://github.com/opendatalab/MinerU/raw/master/docker/global/requirements.txt && \
+    pip3 install --no-cache-dir -r requirements.txt --extra-index-url https://wheels.myhloli.com && \
+    pip3 install --no-cache-dir paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
+
+# Download the configuration template, copy it to /root/magic-pdf.json, and
+# install/upgrade magic-pdf inside the virtual environment.
+# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
+RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \
+    cp magic-pdf.template.json /root/magic-pdf.json && \
+    source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir -U magic-pdf"
+
+# Download models and update the configuration file.
+# The virtual environment is activated first so that huggingface_hub is installed
+# into, and the download script runs with, the same interpreter that the other
+# build steps and the ENTRYPOINT use (instead of the system python3).
+# The final sed rewrites every occurrence of "cpu" in /root/magic-pdf.json to "cuda".
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir huggingface_hub && \
+    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \
+    python3 download_models.py && \
+    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+
+# Set the entry point to activate the virtual environment and run the command line tool.
+# bash -c receives the trailing "--" as $0, so the arguments supplied at
+# `docker run` (or via CMD) arrive as "$@"; `exec` replaces the bash process
+# with that command after the venv has been activated.
+ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]

+ 25 - 0
docker/global/requirements.txt

@@ -0,0 +1,25 @@
+# Python dependencies installed into the MinerU virtual environment by docker/global/Dockerfile
+boto3>=1.28.43
+Brotli>=1.1.0
+click>=8.1.7
+PyMuPDF>=1.24.9
+loguru>=0.6.0
+numpy>=1.21.6,<2.0.0
+fast-langdetect==0.2.0
+scikit-learn>=1.0.2
+pdfminer.six==20231228
+unimernet==0.2.3
+torch>=2.2.2,<=2.3.1
+torchvision>=0.17.2,<=0.18.1
+matplotlib
+ultralytics>=8.3.48
+paddleocr==2.7.3
+struct-eqtable==0.3.2
+einops
+accelerate
+doclayout_yolo==0.0.2
+rapidocr-paddle
+rapidocr-onnxruntime
+rapid_table
+openai
+detectron2

+ 49 - 0
docker/huawei_npu/Dockerfile

@@ -0,0 +1,49 @@
+# Use the Huawei Ascend inference base image (ascend-infer 24.0.RC3, Ubuntu 20.04 variant)
+FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-infer:24.0.RC3-ubuntu20.04
+
+# Suppress interactive prompts during package installation. This is declared as a
+# build-time ARG instead of ENV so the setting is visible to the RUN steps below
+# but is not baked into the environment of containers started from the image.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install the OS-level dependencies in a single layer:
+#   1. software-properties-common is installed first because it provides
+#      add-apt-repository;
+#   2. the deadsnakes PPA is registered (-y answers the confirmation prompt so the
+#      build can never block waiting for input);
+#   3. Python 3.10 (with venv and distutils), pip, wget, git and the libgl1 /
+#      libglib2.0-0 shared libraries are installed;
+#   4. the apt package lists are removed in the same layer to keep it small.
+RUN apt-get update && \
+    apt-get install -y \
+        software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y \
+        python3.10 \
+        python3.10-venv \
+        python3.10-distutils \
+        python3-pip \
+        wget \
+        git \
+        libgl1 \
+        libglib2.0-0 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Register python3.10 as the `python3` alternative, then use that interpreter to
+# create the MinerU virtual environment at /opt/mineru_venv
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+    && python3 -m venv /opt/mineru_venv
+
+# Inside the virtual environment: upgrade pip, download the Huawei NPU
+# requirements list (from the dev branch) and install it using the Aliyun PyPI
+# mirror as the index, with wheels.myhloli.com as an extra package index.
+# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir --upgrade pip && \
+    wget https://gitee.com/myhloli/MinerU/raw/dev/docker/huawei_npu/requirements.txt && \
+    pip3 install --no-cache-dir -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple"
+
+# Download the configuration template, copy it to /root/magic-pdf.json, and
+# install MinerU (magic-pdf) from the dev branch of the Gitee repository inside
+# the virtual environment.
+# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
+RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json && \
+    cp magic-pdf.template.json /root/magic-pdf.json && \
+    source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir git+https://gitee.com/myhloli/MinerU.git@dev"
+
+# Download models and update the configuration file.
+# The virtual environment is activated first so that modelscope is installed
+# into, and the download script runs with, the same interpreter that the other
+# build steps and the ENTRYPOINT use (instead of the system python3).
+# The final sed rewrites every occurrence of "cpu" in /root/magic-pdf.json to "npu".
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install --no-cache-dir modelscope && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
+    python3 download_models.py && \
+    sed -i 's|cpu|npu|g' /root/magic-pdf.json"
+
+# Set the entry point to activate the virtual environment and run the command line tool.
+# bash -c receives the trailing "--" as $0, so the arguments supplied at
+# `docker run` (or via CMD) arrive as "$@"; `exec` replaces the bash process
+# with that command after the venv has been activated.
+ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]

+ 1 - 0
requirements-docker.txt → docker/huawei_npu/requirements.txt

@@ -19,6 +19,7 @@ einops
 accelerate
 doclayout_yolo==0.0.2
 rapidocr-paddle
+rapidocr-onnxruntime
 rapid_table
 doclayout-yolo==0.0.2
 openai