Browse Source

Merge pull request #1759 from opendatalab/release-1.2.0

Release 1.2.0
Xiaomeng Zhao 9 months ago
parent
commit
1f49712974
37 changed files with 735 additions and 388 deletions
  1. 3 3
      README.md
  2. 3 3
      README_zh-CN.md
  3. 2 1
      docker/ascend_npu/Dockerfile
  4. 3 3
      docker/ascend_npu/requirements.txt
  5. 1 1
      docker/china/Dockerfile
  6. 2 2
      docker/china/requirements.txt
  7. 1 1
      docker/global/Dockerfile
  8. 2 2
      docker/global/requirements.txt
  9. 1 1
      docs/README_Ascend_NPU_Acceleration_zh_CN.md
  10. 1 1
      docs/README_Windows_CUDA_Acceleration_en_US.md
  11. 1 1
      docs/README_Windows_CUDA_Acceleration_zh_CN.md
  12. 1 1
      magic_pdf/filter/__init__.py
  13. 6 4
      magic_pdf/filter/pdf_classify_by_type.py
  14. 4 4
      magic_pdf/filter/pdf_meta_scan.py
  15. 11 1
      magic_pdf/libs/pdf_check.py
  16. 23 33
      magic_pdf/model/doc_analyze_by_custom_model.py
  17. 125 4
      magic_pdf/model/magic_model.py
  18. 0 7
      magic_pdf/model/pdf_extract_kit.py
  19. 4 3
      magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py
  20. 28 14
      magic_pdf/model/sub_modules/model_init.py
  21. 44 11
      magic_pdf/pdf_parse_union_core_v2.py
  22. 14 16
      magic_pdf/post_proc/llm_aided.py
  23. 14 2
      magic_pdf/pre_proc/ocr_dict_merge.py
  24. 47 65
      projects/web_api/Dockerfile
  25. 36 20
      projects/web_api/README.md
  26. 237 122
      projects/web_api/app.py
  27. 32 0
      projects/web_api/download_models.py
  28. 5 0
      projects/web_api/entrypoint.sh
  29. 34 3
      projects/web_api/magic-pdf.json
  30. 0 13
      projects/web_api/magic-pdf.template.json
  31. 7 0
      projects/web_api/requirements.txt
  32. BIN
      projects/web_api/small_ocr.pdf
  33. 0 10
      projects/web_api/sources.list
  34. 0 7
      projects/web_api/start_mineru.sh
  35. 37 23
      projects/web_demo/web_demo/common/ext.py
  36. 4 4
      setup.py
  37. 2 2
      tests/unittest/test_integrations/test_rag/test_utils.py

+ 3 - 3
README.md

@@ -244,8 +244,8 @@ Synced with dev branch updates:
 #### 1. Install magic-pdf
 
 ```bash
-conda create -n MinerU python=3.10
-conda activate MinerU
+conda create -n mineru python=3.10
+conda activate mineru
 pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
 ```
 
@@ -305,7 +305,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
   ```bash
   wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
   docker build -t mineru:latest .
-  docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
+  docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
   magic-pdf --help
   ```
 

+ 3 - 3
README_zh-CN.md

@@ -248,8 +248,8 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 > 最新版本国内镜像源同步可能会有延迟,请耐心等待
 
 ```bash
-conda create -n MinerU python=3.10
-conda activate MinerU
+conda create -n mineru python=3.10
+conda activate mineru
 pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
 ```
 
@@ -308,7 +308,7 @@ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i
   ```bash
   wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/Dockerfile -O Dockerfile
   docker build -t mineru:latest .
-  docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
+  docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
   magic-pdf --help
   ```
 ### 使用NPU

+ 2 - 1
docker/ascend_npu/Dockerfile

@@ -36,7 +36,8 @@ RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/ascend_npu/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
     wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
-    pip install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
+    pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
+    pip3 install https://gcore.jsdelivr.net/gh/myhloli/wheels@main/assets/whl/paddle-custom-npu/paddle_custom_npu-0.0.0-cp310-cp310-linux_aarch64.whl"
 
 # Copy the configuration file template and install magic-pdf latest
 RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \

+ 3 - 3
docker/ascend_npu/requirements.txt

@@ -13,12 +13,12 @@ torchvision>=0.17.2,<=0.18.1
 matplotlib
 ultralytics>=8.3.48
 paddleocr==2.7.3
-paddlepaddle==3.0.0b1
+paddlepaddle==3.0.0rc1
 struct-eqtable==0.3.2
 einops
 accelerate
-rapidocr-paddle
-rapidocr-onnxruntime
+rapidocr-paddle>=1.4.5,<2.0.0
+rapidocr-onnxruntime>=1.4.4,<2.0.0
 rapid-table>=1.0.3,<2.0.0
 doclayout-yolo==0.0.2b1
 openai

+ 1 - 1
docker/china/Dockerfile

@@ -32,7 +32,7 @@ RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
     wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
+    pip3 install paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
 # Copy the configuration file template and install magic-pdf latest
 RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \

+ 2 - 2
docker/china/requirements.txt

@@ -16,8 +16,8 @@ paddleocr==2.7.3
 struct-eqtable==0.3.2
 einops
 accelerate
-rapidocr-paddle
-rapidocr-onnxruntime
+rapidocr-paddle>=1.4.5,<2.0.0
+rapidocr-onnxruntime>=1.4.4,<2.0.0
 rapid-table>=1.0.3,<2.0.0
 doclayout-yolo==0.0.2b1
 openai

+ 1 - 1
docker/global/Dockerfile

@@ -32,7 +32,7 @@ RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip && \
     wget https://github.com/opendatalab/MinerU/raw/master/docker/global/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com && \
-    pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
+    pip3 install paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
 # Copy the configuration file template and install magic-pdf latest
 RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \

+ 2 - 2
docker/global/requirements.txt

@@ -16,8 +16,8 @@ paddleocr==2.7.3
 struct-eqtable==0.3.2
 einops
 accelerate
-rapidocr-paddle
-rapidocr-onnxruntime
+rapidocr-paddle>=1.4.5,<2.0.0
+rapidocr-onnxruntime>=1.4.4,<2.0.0
 rapid-table>=1.0.3,<2.0.0
 doclayout-yolo==0.0.2b1
 openai

+ 1 - 1
docs/README_Ascend_NPU_Acceleration_zh_CN.md

@@ -25,7 +25,7 @@ docker build -t mineru_npu:latest .
 ## 运行容器
 
 ```bash
-docker run --rm -it -u root --privileged=true \
+docker run -it -u root --name mineru-npu --privileged=true \
     --ipc=host \
     --network=host \
     --device=/dev/davinci0 \

+ 1 - 1
docs/README_Windows_CUDA_Acceleration_en_US.md

@@ -65,7 +65,7 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-
 1. **Overwrite the installation of torch and torchvision** supporting CUDA.
 
    ```
-   pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
+   pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu118
    ```
 
 2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.

+ 1 - 1
docs/README_Windows_CUDA_Acceleration_zh_CN.md

@@ -66,7 +66,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
 **1.覆盖安装支持cuda的torch和torchvision**
 
 ```bash
-pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
+pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu118
 ```
 
 **2.修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值**

+ 1 - 1
magic_pdf/filter/__init__.py

@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
                 pdf_meta['image_info_per_page'],
                 pdf_meta['text_len_per_page'],
                 pdf_meta['imgs_per_page'],
-                pdf_meta['text_layout_per_page'],
+                # pdf_meta['text_layout_per_page'],
                 pdf_meta['invalid_chars'],
             )
             if is_text_pdf:

+ 6 - 4
magic_pdf/filter/pdf_classify_by_type.py

@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list, invalid_chars: bool):
+             # text_layout_list: list,
+             invalid_chars: bool):
     """
     这里的图片和页面长度单位是pts
     :param total_page:
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_text_len': classify_by_text_len(text_len_list, total_page),
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
-        'by_text_layout': classify_by_text_layout(text_layout_list),
+        # 'by_text_layout': classify_by_text_layout(text_layout_list),
         'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
         'by_invalid_chars': invalid_chars,
     }
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
             f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
-            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            # f" by_text_layout: {results['by_text_layout']},"
+            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
             f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
         return False, results

+ 4 - 4
magic_pdf/filter/pdf_meta_scan.py

@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
         # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
         text_len_per_page = get_pdf_textlen_per_page(doc)
         # logger.info(f"text_len_per_page: {text_len_per_page}")
-        text_layout_per_page = get_pdf_text_layout_per_page(doc)
+        # text_layout_per_page = get_pdf_text_layout_per_page(doc)
         # logger.info(f"text_layout_per_page: {text_layout_per_page}")
-        text_language = get_language(doc)
+        # text_language = get_language(doc)
         # logger.info(f"text_language: {text_language}")
         invalid_chars = check_invalid_chars(pdf_bytes)
         # logger.info(f"invalid_chars: {invalid_chars}")
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
             'page_height_pts': int(page_height_pts),
             'image_info_per_page': image_info_per_page,
             'text_len_per_page': text_len_per_page,
-            'text_layout_per_page': text_layout_per_page,
-            'text_language': text_language,
+            # 'text_layout_per_page': text_layout_per_page,
+            # 'text_language': text_language,
             # "svgs_per_page": svgs_per_page,
             'imgs_per_page': imgs_per_page,  # 增加每页img数量list
             'junk_img_bojids': junk_img_bojids,  # 增加垃圾图片的bojid list

+ 11 - 1
magic_pdf/libs/pdf_check.py

@@ -4,6 +4,7 @@ from loguru import logger
 import re
 from io import BytesIO
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
 
 
 def calculate_sample_count(total_page: int):
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
     sample_docs = extract_pages(src_pdf_bytes)
     sample_pdf_bytes = sample_docs.tobytes()
     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
+    laparams = LAParams(
+        line_overlap=0.5,
+        char_margin=2.0,
+        line_margin=0.5,
+        word_margin=0.1,
+        boxes_flow=None,
+        detect_vertical=False,
+        all_texts=False,
+    )
+    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
     text = text.replace("\n", "")
     # logger.info(text)
     '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''

+ 23 - 33
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -1,21 +1,22 @@
 import os
 import time
+import torch
 
+os.environ['FLAGS_npu_jit_compile'] = '0'  # 关闭paddle的jit编译
+os.environ['FLAGS_use_stride_kernel'] = '0'
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # 让mps可以fallback
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 # 关闭paddle的信号处理
 import paddle
-import torch
+paddle.disable_signal_handler()
+
 from loguru import logger
 
 from magic_pdf.model.batch_analyze import BatchAnalyze
 from magic_pdf.model.sub_modules.model_utils import get_vram
 
-paddle.disable_signal_handler()
-
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-
 try:
     import torchtext
-
     if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
@@ -32,20 +33,6 @@ from magic_pdf.model.model_list import MODEL
 from magic_pdf.operators.models import InferenceResult
 
 
-def dict_compare(d1, d2):
-    return d1.items() == d2.items()
-
-
-def remove_duplicates_dicts(lst):
-    unique_dicts = []
-    for dict_item in lst:
-        if not any(
-            dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
-        ):
-            unique_dicts.append(dict_item)
-    return unique_dicts
-
-
 class ModelSingleton:
     _instance = None
     _models = {}
@@ -158,7 +145,11 @@ def doc_analyze(
     table_enable=None,
 ) -> InferenceResult:
 
-    end_page_id = end_page_id if end_page_id else len(dataset) - 1
+    end_page_id = (
+        end_page_id
+        if end_page_id is not None and end_page_id >= 0
+        else len(dataset) - 1
+    )
 
     model_manager = ModelSingleton()
     custom_model = model_manager.get_model(
@@ -178,21 +169,20 @@ def doc_analyze(
         gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
         if gpu_memory is not None and gpu_memory >= 8:
 
-            if 8 <= gpu_memory < 10:
-                batch_ratio = 2
-            elif 10 <= gpu_memory <= 12:
-                batch_ratio = 4
-            elif 12 < gpu_memory <= 16:
-                batch_ratio = 8
-            elif 16 < gpu_memory <= 24:
+            if gpu_memory >= 40:
+                batch_ratio = 32
+            elif gpu_memory >=20:
                 batch_ratio = 16
+            elif gpu_memory >= 16:
+                batch_ratio = 8
+            elif gpu_memory >= 10:
+                batch_ratio = 4
             else:
-                batch_ratio = 32
+                batch_ratio = 2
 
-            if batch_ratio >= 1:
-                logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
-                batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
-                batch_analyze = True
+            logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
+            batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
+            batch_analyze = True
 
     model_json = []
     doc_analyze_start = time.time()

+ 125 - 4
magic_pdf/model/magic_model.py

@@ -450,11 +450,132 @@ class MagicModel:
             )
         return ret
 
+
+    def __tie_up_category_by_distance_v3(
+        self,
+        page_no: int,
+        subject_category_id: int,
+        object_category_id: int,
+        priority_pos: PosRelationEnum,
+    ):
+        subjects = self.__reduct_overlap(
+            list(
+                map(
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
+                    filter(
+                        lambda x: x['category_id'] == subject_category_id,
+                        self.__model_list[page_no]['layout_dets'],
+                    ),
+                )
+            )
+        )
+        objects = self.__reduct_overlap(
+            list(
+                map(
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
+                    filter(
+                        lambda x: x['category_id'] == object_category_id,
+                        self.__model_list[page_no]['layout_dets'],
+                    ),
+                )
+            )
+        )
+
+        ret = []
+        N, M = len(subjects), len(objects)
+        subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
+        objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
+
+        OBJ_IDX_OFFSET = 10000
+        SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1
+
+        all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET , OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)]
+        seen_idx = set()
+        seen_sub_idx = set()
+
+        while N > len(seen_sub_idx):
+            candidates = []
+            for idx, kind, x0, y0 in all_boxes_with_idx:
+                if idx in seen_idx:
+                    continue
+                candidates.append((idx, kind, x0, y0))
+
+            if len(candidates) == 0:
+                break
+            left_x = min([v[2] for v in candidates])
+            top_y =  min([v[3] for v in candidates])
+
+            candidates.sort(key=lambda x: (x[2]-left_x) ** 2 + (x[3] - top_y) ** 2)
+
+
+            fst_idx, fst_kind, left_x, top_y = candidates[0]
+            candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y)**2)
+            nxt = None
+
+            for i in range(1, len(candidates)):
+                if candidates[i][1] ^ fst_kind == 1:
+                    nxt = candidates[i]
+                    break
+            if nxt is None:
+                break
+
+            if fst_kind == SUB_BIT_KIND:
+                sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET
+
+            else:
+                sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET
+
+            pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
+            nearest_dis = float('inf')
+            for i in range(N):
+                if i in seen_idx:continue
+                nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
+
+            if pair_dis >= 3*nearest_dis:
+                seen_idx.add(sub_idx)
+                continue
+
+
+            seen_idx.add(sub_idx)
+            seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
+            seen_sub_idx.add(sub_idx)
+
+            ret.append(
+                {
+                    'sub_bbox': {
+                        'bbox': subjects[sub_idx]['bbox'],
+                        'score': subjects[sub_idx]['score'],
+                    },
+                    'obj_bboxes': [
+                        {'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']}
+                    ],
+                    'sub_idx': sub_idx,
+                }
+            )
+
+        for i in range(len(subjects)):
+            if i in seen_sub_idx:
+                continue
+            ret.append(
+                {
+                    'sub_bbox': {
+                        'bbox': subjects[i]['bbox'],
+                        'score': subjects[i]['score'],
+                    },
+                    'obj_bboxes': [],
+                    'sub_idx': i,
+                }
+            )
+
+
+        return ret
+
+
     def get_imgs_v2(self, page_no: int):
-        with_captions = self.__tie_up_category_by_distance_v2(
+        with_captions = self.__tie_up_category_by_distance_v3(
             page_no, 3, 4, PosRelationEnum.BOTTOM
         )
-        with_footnotes = self.__tie_up_category_by_distance_v2(
+        with_footnotes = self.__tie_up_category_by_distance_v3(
             page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
         )
         ret = []
@@ -470,10 +591,10 @@ class MagicModel:
         return ret
 
     def get_tables_v2(self, page_no: int) -> list:
-        with_captions = self.__tie_up_category_by_distance_v2(
+        with_captions = self.__tie_up_category_by_distance_v3(
             page_no, 5, 6, PosRelationEnum.UP
         )
-        with_footnotes = self.__tie_up_category_by_distance_v2(
+        with_footnotes = self.__tie_up_category_by_distance_v3(
             page_no, 5, 7, PosRelationEnum.ALL
         )
         ret = []

+ 0 - 7
magic_pdf/model/pdf_extract_kit.py

@@ -89,13 +89,6 @@ class CustomPEKModel:
         # 初始化解析方案
         self.device = kwargs.get('device', 'cpu')
 
-        if str(self.device).startswith("npu"):
-            import torch_npu
-            os.environ['FLAGS_npu_jit_compile'] = '0'
-            os.environ['FLAGS_use_stride_kernel'] = '0'
-        elif str(self.device).startswith("mps"):
-            os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
-
         logger.info('using device: {}'.format(self.device))
         models_dir = kwargs.get(
             'models_dir', os.path.join(root_dir, 'resources', 'models')

+ 4 - 3
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py

@@ -1,4 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
+import time
 from collections import Counter
 from uuid import uuid4
 
@@ -102,9 +103,9 @@ class YOLOv11LangDetModel(object):
             temp_images = split_images(image)
             for temp_image in temp_images:
                 all_images.append(resize_images_to_224(temp_image))
-
-        images_lang_res = self.batch_predict(all_images, batch_size=8)
-        # logger.info(f"images_lang_res: {images_lang_res}")
+        # langdetect_start = time.time()
+        images_lang_res = self.batch_predict(all_images, batch_size=256)
+        # logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
         if len(images_lang_res) > 0:
             count_dict = Counter(images_lang_res)
             language = max(count_dict, key=count_dict.get)

+ 28 - 14
magic_pdf/model/sub_modules/model_init.py

@@ -4,22 +4,37 @@ from loguru import logger
 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.model_list import AtomicModel
 from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
-from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
-    DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
-    Layoutlmv3_Predictor
+from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
 from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
-from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import \
-    ModifiedPaddleOCR
-from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import \
-    RapidTableModel
-# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
-from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
-    StructTableModel
-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
-    TableMasterPaddleModel
 
+try:
+    from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
+    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
+    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
+    license_key = load_license()
+    logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
+                f' License expired at {license_key["payload"]["date"]["end_date"]}')
+except Exception as e:
+    if isinstance(e, ImportError):
+        pass
+    elif isinstance(e, LicenseFormatError):
+        logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
+    elif isinstance(e, LicenseSignatureError):
+        logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
+    elif isinstance(e, LicenseExpiredError):
+        logger.error("Ascend Plugin: License has expired. Please renew your license.")
+    elif isinstance(e, FileNotFoundError):
+        logger.error("Ascend Plugin: Not found License file.")
+    else:
+        logger.error(f"Ascend Plugin: {e}")
+    from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
+    # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
+    from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
+
+from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
+from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
 
 def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
     if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
@@ -76,7 +91,6 @@ def ocr_model_init(show_log: bool = False,
                    use_dilation=True,
                    det_db_unclip_ratio=1.8,
                    ):
-
     if lang is not None and lang != '':
         model = ModifiedPaddleOCR(
             show_log=show_log,

+ 44 - 11
magic_pdf/pdf_parse_union_core_v2.py

@@ -6,8 +6,10 @@ import statistics
 import time
 from typing import List
 
+import cv2
 import fitz
 import torch
+import numpy as np
 from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -127,16 +129,15 @@ def fill_char_in_spans(spans, all_chars):
                 span['chars'].append(char)
                 break
 
-    empty_spans = []
-
+    need_ocr_spans = []
     for span in spans:
         chars_to_content(span)
         # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
         if len(span['content']) * span['height'] < span['width'] * 0.5:
             # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
-            empty_spans.append(span)
+            need_ocr_spans.append(span)
         del span['height'], span['width']
-    return empty_spans
+    return need_ocr_spans
 
 
 # 使用鲁棒性更强的中心点坐标判断
@@ -190,6 +191,31 @@ def remove_tilted_line(text_blocks):
             block['lines'].remove(line)
 
 
+def calculate_contrast(img, img_mode) -> float:
+    """
+    计算给定图像的对比度。
+    :param img: 图像,类型为numpy.ndarray
+    :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr'
+    :return: 图像的对比度值
+    """
+    if img_mode == 'rgb':
+        # 将RGB图像转换为灰度图
+        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    elif img_mode == 'bgr':
+        # 将BGR图像转换为灰度图
+        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    else:
+        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
+
+    # 计算均值和标准差
+    mean_value = np.mean(gray_img)
+    std_dev = np.std(gray_img)
+    # 对比度定义为标准差除以平均值(加上小常数避免除零错误)
+    contrast = std_dev / (mean_value + 1e-6)
+    # logger.info(f"contrast: {contrast}")
+    return round(contrast, 2)
+
+
 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
     # cid用0xfffd表示,连字符拆开
     # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -274,9 +300,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
             span['chars'] = []
             new_spans.append(span)
 
-    empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
+    need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
 
-    if len(empty_spans) > 0:
+    if len(need_ocr_spans) > 0:
 
         # 初始化ocr模型
         atom_model_manager = AtomModelSingleton()
@@ -287,9 +313,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
             lang=lang
         )
 
-        for span in empty_spans:
+        for span in need_ocr_spans:
             # 对span的bbox截图再ocr
             span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
+
+            # 计算span的对比度,低于0.20的span不进行ocr
+            if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
+                spans.remove(span)
+                continue
+
             ocr_res = ocr_model.ocr(span_img, det=False)
             if ocr_res and len(ocr_res) > 0:
                 if len(ocr_res[0]) > 0:
@@ -404,10 +436,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
             block_bboxes.append(block['bbox'])
 
             # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
-            if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
-                block['virtual_lines'] = copy.deepcopy(block['lines'])
-                block['lines'] = copy.deepcopy(block['real_lines'])
-                del block['real_lines']
+            if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
+                if 'real_lines' in block:
+                    block['virtual_lines'] = copy.deepcopy(block['lines'])
+                    block['lines'] = copy.deepcopy(block['real_lines'])
+                    del block['real_lines']
 
         import numpy as np
 

+ 14 - 16
magic_pdf/post_proc/llm_aided.py

@@ -3,6 +3,7 @@ import json
 from loguru import logger
 from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
 from openai import OpenAI
+import ast
 
 
 #@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
@@ -119,11 +120,12 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
     - 在完成初步分级后,仔细检查分级结果的合理性
     - 根据上下文关系和逻辑顺序,对不合理的分级进行微调
     - 确保最终的分级结果符合文档的实际结构和逻辑
+    - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
     
 IMPORTANT: 
-请直接返回优化过的由标题层级组成的json,格式如下:
-{{"0":1,"1":2,"2":2,"3":3}}
-返回的json不需要格式化。
+请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
+{{0:1,1:2,2:2,3:3}}
+不需要对字典格式化,不需要返回任何其他信息
 
 Input title list:
 {title_dict}
@@ -133,7 +135,7 @@ Corrected title list:
 
     retry_count = 0
     max_retries = 3
-    json_completion = None
+    dict_completion = None
 
     while retry_count < max_retries:
         try:
@@ -143,24 +145,20 @@ Corrected title list:
                     {'role': 'user', 'content': title_optimize_prompt}],
                 temperature=0.7,
             )
-            json_completion = json.loads(completion.choices[0].message.content)
+            # logger.info(f"Title completion: {completion.choices[0].message.content}")
+            dict_completion = ast.literal_eval(completion.choices[0].message.content)
+            # logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
 
-            # logger.info(f"Title completion: {json_completion}")
-            # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
-
-            if len(json_completion) == len(title_dict):
+            if len(dict_completion) == len(title_dict):
                 for i, origin_title_block in enumerate(origin_title_list):
-                    origin_title_block["level"] = int(json_completion[str(i)])
+                    origin_title_block["level"] = int(dict_completion[i])
                 break
             else:
                 logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
                 retry_count += 1
         except Exception as e:
-            if isinstance(e, json.decoder.JSONDecodeError):
-                logger.warning(f"JSON decode error on attempt {retry_count + 1}: {e}")
-            else:
-                logger.exception(e)
+            logger.exception(e)
             retry_count += 1
 
-    if json_completion is None:
-        logger.error("Failed to decode JSON after maximum retries.")
+    if dict_completion is None:
+        logger.error("Failed to decode dict after maximum retries.")

+ 14 - 2
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -60,6 +60,19 @@ def merge_spans_to_line(spans, threshold=0.6):
         return lines
 
 
+def span_block_type_compatible(span_type, block_type):
+    """Return whether a span of ``span_type`` may be placed in a block of ``block_type``.
+
+    Text and inline-equation spans are accepted by text, title and
+    image/table caption/footnote blocks. Interline-equation, image and table
+    spans are accepted only by the interline-equation, image-body and
+    table-body block respectively. Any other span type is incompatible with
+    every block type.
+    """
+    if span_type in [ContentType.Text, ContentType.InlineEquation]:
+        return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
+    elif span_type == ContentType.InterlineEquation:
+        return block_type in [BlockType.InterlineEquation]
+    elif span_type == ContentType.Image:
+        return block_type in [BlockType.ImageBody]
+    elif span_type == ContentType.Table:
+        return block_type in [BlockType.TableBody]
+    else:
+        return False
+
+
 def fill_spans_in_blocks(blocks, spans, radio):
     """将allspans中的span按位置关系,放入blocks中."""
     block_with_spans = []
@@ -78,8 +91,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
         block_spans = []
         for span in spans:
             span_bbox = span['bbox']
-            if calculate_overlap_area_in_bbox1_area_ratio(
-                    span_bbox, block_bbox) > radio:
+            if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
                 block_spans.append(span)
 
         block_dict['spans'] = block_spans

+ 47 - 65
projects/web_api/Dockerfile

@@ -1,85 +1,67 @@
-# Use the official Ubuntu base image
-FROM ubuntu:latest
+FROM python:3.10-slim-bookworm AS base
 
-# ENV http_proxy http://127.0.0.1:7890
-# ENV https_proxy http://127.0.0.1:7890
+WORKDIR /app
 
-# Set environment variables to non-interactive to avoid prompts during installation
-ENV DEBIAN_FRONTEND=noninteractive
-ENV LANG C.UTF-8
-
-# ADD sources.list /etc/apt
-# RUN apt-get clean
+ENV DEBIAN_FRONTEND=noninteractive \
+    LANG=C.UTF-8 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1
 
 
+FROM base AS build
 
 # Update the package list and install necessary packages
-RUN apt-get -q update \
-    && apt-get -q install -y --no-install-recommends \
-        apt-utils \
-        bats \
-        build-essential
-RUN apt-get update && apt-get install -y vim net-tools procps lsof curl wget iputils-ping telnet lrzsz git
-
 RUN apt-get update && \
-    apt-get install -y \
-        software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get update && \
-    apt-get install -y \
-        python3.10 \
-        python3.10-venv \
-        python3.10-distutils \
-        python3-pip \
-        wget \
-        git \
-        libgl1 \
-        libglib2.0-0 \
-        && rm -rf /var/lib/apt/lists/*
-        
-# RUN unset http_proxy && unset https_proxy
-
-# Set Python 3.10 as the default python3
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
-# Activate the virtual environment and install necessary Python packages
-RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip install --upgrade pip && \
-    pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/ --no-cache-dir"
+    apt-get install -y --no-install-recommends \
+        build-essential && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
+# Build Python dependencies
+COPY requirements.txt .
+RUN python -m venv /app/venv && \
+    . /app/venv/bin/activate && \
+    pip install -r requirements.txt && \
+    pip uninstall -y paddlepaddle && \
+    pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
+        paddlepaddle-gpu==3.0.0rc1
 
-RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip install fastapi uvicorn python-multipart --no-cache-dir"
+# Download models
+COPY download_models.py .
+RUN . /app/venv/bin/activate && \
+    ./download_models.py
 
-RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip uninstall  paddlepaddle -y"
 
-RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --no-cache-dir"
+FROM base AS prod
 
-# Copy the configuration file template and set up the model directory
-COPY magic-pdf.template.json /root/magic-pdf.json
-ADD models /opt/models
-ADD .paddleocr /root/.paddleocr 
-ADD app.py /root/app.py
+# Copy Python dependencies and models from the build stage
+COPY --from=build /app/venv /app/venv
+COPY --from=build /opt/models /opt/models
+COPY --from=build /opt/layoutreader /opt/layoutreader
 
-WORKDIR /root
-
-# Set the models directory in the configuration file (adjust the path as needed)
-RUN sed -i 's|/tmp/models|/opt/models|g' /root/magic-pdf.json
-
-# Create the models directory
-# RUN mkdir -p /opt/models
+# Update the package list and install necessary packages
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libgl1 \
+        libglib2.0-0 \
+        libgomp1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-# Set the entry point to activate the virtual environment and run the command line tool
-# ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\" && python3 app.py", "--"]
+# Create volume for paddleocr models
+RUN mkdir -p /root/.paddleocr
+VOLUME [ "/root/.paddleocr" ]
 
+# Copy the app and its configuration file
+COPY entrypoint.sh /app/entrypoint.sh
+COPY magic-pdf.json /root/magic-pdf.json
+COPY app.py /app/app.py
 
 # Expose the port that FastAPI will run on
 EXPOSE 8000
 
 # Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
-CMD ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && uvicorn app:app --host 0.0.0.0 --port 8000"]
+ENTRYPOINT [ "/app/entrypoint.sh" ]
+CMD ["--host", "0.0.0.0", "--port", "8000"]

+ 36 - 20
projects/web_api/README.md

@@ -1,44 +1,60 @@
-基于MinerU的PDF解析API
+# 基于MinerU的PDF解析API
 
-    - MinerU的GPU镜像构建
-    - 基于FastAPI的PDF解析接口
+- MinerU的GPU镜像构建
+- 基于FastAPI的PDF解析接口
 
-支持一键启动,已经打包到镜像中,自带模型权重,支持GPU推理加速,GPU速度相比CPU每页解析要快几十倍不等
+## 构建方式
 
+```
+docker build -t mineru-api .
+```
 
-##  启动命令:
+或者使用代理
 
+```
+docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .
+```
 
-```docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models```
+## 启动命令
 
-![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
+```
+docker run --rm -it --gpus=all -v ./paddleocr:/root/.paddleocr -p 8000:8000 mineru-api
+```
 
-具体截图请见博客:https://blog.csdn.net/yanqianglifei/article/details/141979684
+初次调用 API 时会自动下载 paddleocr 的模型(约数十 MB),其余模型已包含在镜像中。
 
+## 测试参数
 
-##   启动日志
+访问地址
 
-![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
+```
+http://localhost:8000/docs
+http://127.0.0.1:8000/docs
+```
 
-##  输入参数:
+## 旧版镜像地址
 
-访问地址:
+> 阿里云地址:docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
+>
+> dockerhub地址:docker pull quincyqiang/mineru:0.1-models
 
-    http://localhost:8888/docs
 
-    http://127.0.01:8888/docs
+## 旧版截图
 
-![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
+### 启动命令
 
-##  解析效果:
+![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
 
-![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)
+具体截图请见博客:https://blog.csdn.net/yanqianglifei/article/details/141979684
 
+### 启动日志
 
+![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
 
-##   镜像地址:
+### 测试参数
 
-> 阿里云地址:docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
+![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
 
-> dockerhub地址:docker pull quincyqiang/mineru:0.1-models
+### 解析效果
 
+![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)

+ 237 - 122
projects/web_api/app.py

@@ -1,163 +1,278 @@
-import copy
 import json
 import os
-from tempfile import NamedTemporaryFile
+from base64 import b64encode
+from glob import glob
+from io import StringIO
+from typing import Tuple, Union
 
 import uvicorn
-from fastapi import FastAPI, File, UploadFile
+from fastapi import FastAPI, HTTPException, UploadFile
 from fastapi.responses import JSONResponse
 from loguru import logger
 
 import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
+from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
 from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.operators.models import InferenceResult
+from magic_pdf.operators.pipes import PipeResult
 
 model_config.__use_inside_model__ = True
 
 app = FastAPI()
 
 
-def json_md_dump(
-    model_json,
-    middle_json,
-    md_writer,
-    pdf_name,
-    content_list,
-    md_content,
-):
-    # Write model results to model.json
-    orig_model_list = copy.deepcopy(model_json)
-    md_writer.write_string(
-        f'{pdf_name}_model.json',
-        json.dumps(orig_model_list, ensure_ascii=False, indent=4),
-    )
-
-    # Write intermediate results to middle.json
-    md_writer.write_string(
-        f'{pdf_name}_middle.json',
-        json.dumps(middle_json, ensure_ascii=False, indent=4),
-    )
-
-    # Write text content results to content_list.json
-    md_writer.write_string(
-        f'{pdf_name}_content_list.json',
-        json.dumps(content_list, ensure_ascii=False, indent=4),
-    )
-
-    # Write results to .md file
-    md_writer.write_string(
-        f'{pdf_name}.md',
-        md_content,
-    )
-
-
-@app.post('/pdf_parse', tags=['projects'], summary='Parse PDF file')
-async def pdf_parse_main(
-    pdf_file: UploadFile = File(...),
-    parse_method: str = 'auto',
-    model_json_path: str = None,
-    is_json_md_dump: bool = True,
-    output_dir: str = 'output',
-):
-    """Execute the process of converting PDF to JSON and MD, outputting MD and
-    JSON files to the specified directory.
-
-    :param pdf_file: The PDF file to be parsed
-    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
-    :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
-    :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file  # noqa E501
-    :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
+class MemoryDataWriter(DataWriter):
+    """DataWriter that accumulates everything written into an in-memory text buffer.
+
+    The ``path`` argument of the write methods is ignored: every write is
+    appended to the same ``StringIO`` buffer, in call order.
+    """
+    def __init__(self):
+        self.buffer = StringIO()
+
+    def write(self, path: str, data: bytes) -> None:
+        """Append ``data`` to the buffer; ``path`` is unused.
+
+        ``str`` input is written as-is, anything else is decoded as UTF-8 first.
+        """
+        if isinstance(data, str):
+            self.buffer.write(data)
+        else:
+            self.buffer.write(data.decode("utf-8"))
+
+    def write_string(self, path: str, data: str) -> None:
+        """Append the text ``data`` to the buffer; ``path`` is unused."""
+        self.buffer.write(data)
+
+    def get_value(self) -> str:
+        """Return all text written to the buffer so far."""
+        return self.buffer.getvalue()
+
+    def close(self):
+        """Close the underlying buffer."""
+        self.buffer.close()
+
+
+def init_writers(
+    pdf_path: str = None,
+    pdf_file: UploadFile = None,
+    output_path: str = None,
+    output_image_path: str = None,
+) -> Tuple[
+    Union[S3DataWriter, FileBasedDataWriter],
+    Union[S3DataWriter, FileBasedDataWriter],
+    bytes,
+]:
     """
-    try:
-        # Create a temporary file to store the uploaded PDF
-        with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
-            temp_pdf.write(await pdf_file.read())
-            temp_pdf_path = temp_pdf.name
+    Initialize writers based on path type
 
-        pdf_name = os.path.basename(pdf_file.filename).split('.')[0]
+    Args:
+        pdf_path: PDF file path (local path or S3 path)
+        pdf_file: Uploaded PDF file object
+        output_path: Output directory path
+        output_image_path: Image output directory path
 
-        if output_dir:
-            output_path = os.path.join(output_dir, pdf_name)
+    Returns:
+        Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF
+        file content
+    """
+    if pdf_path:
+        is_s3_path = pdf_path.startswith("s3://")
+        if is_s3_path:
+            bucket = get_bucket_name(pdf_path)
+            ak, sk, endpoint = get_s3_config(bucket)
+
+            writer = S3DataWriter(
+                output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
+            )
+            image_writer = S3DataWriter(
+                output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
+            )
+            # 临时创建reader读取文件内容
+            temp_reader = S3DataReader(
+                "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
+            )
+            pdf_bytes = temp_reader.read(pdf_path)
         else:
-            output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)
+            writer = FileBasedDataWriter(output_path)
+            image_writer = FileBasedDataWriter(output_image_path)
+            os.makedirs(output_image_path, exist_ok=True)
+            with open(pdf_path, "rb") as f:
+                pdf_bytes = f.read()
+    else:
+        # 处理上传的文件
+        pdf_bytes = pdf_file.file.read()
+        writer = FileBasedDataWriter(output_path)
+        image_writer = FileBasedDataWriter(output_image_path)
+        os.makedirs(output_image_path, exist_ok=True)
 
-        output_image_path = os.path.join(output_path, 'images')
+    return writer, image_writer, pdf_bytes
 
-        # Get parent path of images for relative path in .md and content_list.json
-        image_path_parent = os.path.basename(output_image_path)
 
-        pdf_bytes = open(temp_pdf_path, 'rb').read()  # Read binary data of PDF file
+def process_pdf(
+    pdf_bytes: bytes,
+    parse_method: str,
+    image_writer: Union[S3DataWriter, FileBasedDataWriter],
+) -> Tuple[InferenceResult, PipeResult]:
+    """
+    Process PDF file content
+
+    Args:
+        pdf_bytes: Binary content of PDF file
+        parse_method: Parse method ('ocr', 'txt', 'auto')
+        image_writer: Image writer
 
-        if model_json_path:
-            # Read original JSON data of PDF file parsed by model, list type
-            model_json = json.loads(open(model_json_path, 'r', encoding='utf-8').read())
+    Returns:
+        Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
+    """
+    ds = PymuDocDataset(pdf_bytes)
+    infer_result: InferenceResult = None
+    pipe_result: PipeResult = None
+
+    if parse_method == "ocr":
+        infer_result = ds.apply(doc_analyze, ocr=True)
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+    elif parse_method == "txt":
+        infer_result = ds.apply(doc_analyze, ocr=False)
+        pipe_result = infer_result.pipe_txt_mode(image_writer)
+    else:  # auto
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            infer_result = ds.apply(doc_analyze, ocr=True)
+            pipe_result = infer_result.pipe_ocr_mode(image_writer)
         else:
-            model_json = []
-
-        # Execute parsing steps
-        image_writer, md_writer = FileBasedDataWriter(
-            output_image_path
-        ), FileBasedDataWriter(output_path)
-
-        ds = PymuDocDataset(pdf_bytes)
-        # Choose parsing method
-        if parse_method == 'auto':
-            if ds.classify() == SupportedPdfParseMethod.OCR:
-                parse_method = 'ocr'
-            else:
-                parse_method = 'txt'
-
-        if parse_method not in ['txt', 'ocr']:
-            logger.error('Unknown parse method, only auto, ocr, txt allowed')
+            infer_result = ds.apply(doc_analyze, ocr=False)
+            pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+    return infer_result, pipe_result
+
+
+def encode_image(image_path: str) -> str:
+    """Read the file at ``image_path`` and return its bytes base64-encoded as a string."""
+    with open(image_path, "rb") as f:
+        return b64encode(f.read()).decode()
+
+
+@app.post(
+    "/pdf_parse",
+    tags=["projects"],
+    summary="Parse PDF files (supports local files and S3)",
+)
+async def pdf_parse(
+    pdf_file: UploadFile = None,
+    pdf_path: str = None,
+    parse_method: str = "auto",
+    is_json_md_dump: bool = False,
+    output_dir: str = "output",
+    return_layout: bool = False,
+    return_info: bool = False,
+    return_content_list: bool = False,
+    return_images: bool = False,
+):
+    """
+    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
+    to the specified directory.
+
+    Args:
+        pdf_file: The PDF file to be parsed. Must not be specified together with
+            `pdf_path`
+        pdf_path: The path to the PDF file to be parsed. Must not be specified together
+            with `pdf_file`
+        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
+            results are not satisfactory, try ocr
+        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
+            to False. Different stages of data will be written to different .json files
+            (3 in total), md content will be saved to .md file
+        output_dir: Output directory for results. A folder named after the PDF file
+            will be created to store all results
+        return_layout: Whether to return parsed PDF layout. Default to False
+        return_info: Whether to return parsed PDF info. Default to False
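+        return_content_list: Whether to return parsed PDF content list. Default to False
+        return_images: Whether to return the .jpg files found in the output image
+            directory, as base64 `data:image/jpeg` URIs keyed by file name. Default
+            to False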
+    """
+    try:
+        if (pdf_file is None and pdf_path is None) or (
+            pdf_file is not None and pdf_path is not None
+        ):
             return JSONResponse(
-                content={'error': 'Invalid parse method'}, status_code=400
+                content={"error": "Must provide either pdf_file or pdf_path"},
+                status_code=400,
             )
 
-        if len(model_json) == 0:
-            if parse_method == 'ocr':
-                infer_result = ds.apply(doc_analyze, ocr=True)
-            else:
-                infer_result = ds.apply(doc_analyze, ocr=False)
+        # Get PDF filename
+        pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split(
+            "."
+        )[0]
+        output_path = f"{output_dir}/{pdf_name}"
+        output_image_path = f"{output_path}/images"
 
-        else:
-            infer_result = InferenceResult(model_json, ds)
-
-        if len(model_json) == 0 and not model_config.__use_inside_model__:
-                logger.error('Need model list input')
-                return JSONResponse(
-                    content={'error': 'Model list input required'}, status_code=400
-                )
-        if parse_method == 'ocr':
-            pipe_res = infer_result.pipe_ocr_mode(image_writer)
-        else:
-            pipe_res = infer_result.pipe_txt_mode(image_writer)
+        # Initialize readers/writers and get PDF content
+        writer, image_writer, pdf_bytes = init_writers(
+            pdf_path=pdf_path,
+            pdf_file=pdf_file,
+            output_path=output_path,
+            output_image_path=output_image_path,
+        )
+
+        # Process PDF
+        infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
+
+        # Use MemoryDataWriter to get results
+        content_list_writer = MemoryDataWriter()
+        md_content_writer = MemoryDataWriter()
+        middle_json_writer = MemoryDataWriter()
 
+        # Use PipeResult's dump method to get data
+        pipe_result.dump_content_list(content_list_writer, "", "images")
+        pipe_result.dump_md(md_content_writer, "", "images")
+        pipe_result.dump_middle_json(middle_json_writer, "")
 
-        # Save results in text and md format
-        content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
-        md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
+        # Get content
+        content_list = json.loads(content_list_writer.get_value())
+        md_content = md_content_writer.get_value()
+        middle_json = json.loads(middle_json_writer.get_value())
+        model_json = infer_result.get_infer_res()
 
+        # If results need to be saved
         if is_json_md_dump:
-            json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content)
-        data = {
-            'layout': copy.deepcopy(infer_result._infer_res),
-            'info': pipe_res._pipe_res,
-            'content_list': content_list,
-            'md_content': md_content,
-        }
+            writer.write_string(
+                f"{pdf_name}_content_list.json", content_list_writer.get_value()
+            )
+            writer.write_string(f"{pdf_name}.md", md_content)
+            writer.write_string(
+                f"{pdf_name}_middle.json", middle_json_writer.get_value()
+            )
+            writer.write_string(
+                f"{pdf_name}_model.json",
+                json.dumps(model_json, indent=4, ensure_ascii=False),
+            )
+            # Save visualization results
+            pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
+            pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
+            pipe_result.draw_line_sort(
+                os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
+            )
+            infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))
+
+        # Build return data
+        data = {}
+        if return_layout:
+            data["layout"] = model_json
+        if return_info:
+            data["info"] = middle_json
+        if return_content_list:
+            data["content_list"] = content_list
+        if return_images:
+            image_paths = glob(f"{output_image_path}/*.jpg")
+            data["images"] = {
+                os.path.basename(
+                    image_path
+                ): f"data:image/jpeg;base64,{encode_image(image_path)}"
+                for image_path in image_paths
+            }
+        data["md_content"] = md_content  # md_content is always returned
+
+        # Clean up memory writers
+        content_list_writer.close()
+        md_content_writer.close()
+        middle_json_writer.close()
+
         return JSONResponse(data, status_code=200)
 
     except Exception as e:
         logger.exception(e)
-        return JSONResponse(content={'error': str(e)}, status_code=500)
-    finally:
-        # Clean up the temporary file
-        if 'temp_pdf_path' in locals():
-            os.unlink(temp_pdf_path)
+        return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
-if __name__ == '__main__':
-    uvicorn.run(app, host='0.0.0.0', port=8888)
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8888)

+ 32 - 0
projects/web_api/download_models.py

@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+from huggingface_hub import snapshot_download
+
+if __name__ == "__main__":
+
+    # Only files matching these patterns are requested from the
+    # PDF-Extract-Kit repository.
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small_2501/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    # Download into /opt/ as the local directory.
+    model_dir = snapshot_download(
+        "opendatalab/PDF-Extract-Kit-1.0",
+        allow_patterns=mineru_patterns,
+        local_dir="/opt/",
+    )
+
+    # For layoutreader only the *.json and *.safetensors files are requested.
+    layoutreader_pattern = [
+        "*.json",
+        "*.safetensors",
+    ]
+    layoutreader_model_dir = snapshot_download(
+        "hantian/layoutreader",
+        allow_patterns=layoutreader_pattern,
+        local_dir="/opt/layoutreader/",
+    )
+
+    # Every requested pattern starts with "models/", so report that sub-directory.
+    model_dir = model_dir + "/models"
+    print(f"model_dir is: {model_dir}")
+    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")

+ 5 - 0
projects/web_api/entrypoint.sh

@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+. /app/venv/bin/activate
+exec uvicorn app:app "$@"

+ 34 - 3
projects/web_api/magic-pdf.json

@@ -4,10 +4,41 @@
         "bucket-name-2":["ak", "sk", "endpoint"]
     },
     "models-dir":"/opt/models",
+    "layoutreader-model-dir":"/opt/layoutreader",
     "device-mode":"cuda",
+    "layout-config": {
+        "model": "doclayout_yolo"
+    },
+    "formula-config": {
+        "mfd_model": "yolo_v8_mfd",
+        "mfr_model": "unimernet_small",
+        "enable": true
+    },
     "table-config": {
-        "model": "TableMaster",
-        "is_table_recog_enable": false,
+        "model": "rapid_table",
+        "sub_model": "slanet_plus",
+        "enable": true,
         "max_time": 400
-    }
+    },
+    "llm-aided-config": {
+        "formula_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-7b-instruct",
+            "enable": false
+        },
+        "text_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-7b-instruct",
+            "enable": false
+        },
+        "title_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-32b-instruct",
+            "enable": false
+        }
+    },
+    "config_version": "1.1.1"
 }

+ 0 - 13
projects/web_api/magic-pdf.template.json

@@ -1,13 +0,0 @@
-{
-    "bucket_info":{
-        "bucket-name-1":["ak", "sk", "endpoint"],
-        "bucket-name-2":["ak", "sk", "endpoint"]
-    },
-    "models-dir":"/tmp/models",
-    "device-mode":"cuda",
-    "table-config": {
-        "model": "TableMaster",
-        "is_table_recog_enable": false,
-        "max_time": 400
-    }
-}

+ 7 - 0
projects/web_api/requirements.txt

@@ -0,0 +1,7 @@
+--extra-index-url https://myhloli.github.io/wheels/
+
+magic-pdf[full]
+
+fastapi
+uvicorn
+python-multipart

BIN
projects/web_api/small_ocr.pdf


+ 0 - 10
projects/web_api/sources.list

@@ -1,10 +0,0 @@
-deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
-deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
-deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
-deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
-deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
-deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
-deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
-deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
-deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
-deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse

+ 0 - 7
projects/web_api/start_mineru.sh

@@ -1,7 +0,0 @@
-docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models /bin/bash
-
-docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.3-models
-
-docker login --username=1185918903@qq.com registry.cn-beijing.aliyuncs.com
-docker tag quincyqiang/mineru:0.3-models registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models
-docker push registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models

+ 37 - 23
projects/web_demo/web_demo/common/ext.py

@@ -1,37 +1,51 @@
 import hashlib
 import mimetypes
+import urllib.parse
 
 
 def is_pdf(filename, file):
     """
-    判断文件是否为PDF格式。
+    判断文件是否为PDF格式,支持中文名和特殊字符
 
     :param filename: 文件名
     :param file: 文件对象
     :return: 如果文件是PDF格式,则返回True,否则返回False
     """
-    # 检查文件扩展名  https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况,先注释
-    # if not filename.endswith('.pdf'):
-    #     return False
-
-    # 检查MIME类型
-    mime_type, _ = mimetypes.guess_type(filename)
-    print(mime_type)
-    if mime_type != 'application/pdf':
-        return False
-
-    # 可选:读取文件的前几KB内容并检查MIME类型
-    # 这一步是可选的,用于更严格的检查
-    # if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf':
-    #     return False
-
-    # 检查文件内容
-    file_start = file.read(5)
-    file.seek(0)
-    if not file_start.startswith(b'%PDF-'):
-        return False
-
-    return True
+    try:
+        # 对文件名进行URL解码,处理特殊字符
+        decoded_filename = urllib.parse.unquote(filename)
+        
+        # 检查MIME类型
+        mime_type, _ = mimetypes.guess_type(decoded_filename)
+        print(f"Detected MIME type: {mime_type}")
+        
+        # 某些情况下mime_type可能为None,需要特殊处理
+        if mime_type is None:
+            # 只检查文件内容的PDF标识
+            file_start = file.read(5)
+            file.seek(0)  # 重置文件指针
+            return file_start.startswith(b'%PDF-')
+            
+        if mime_type != 'application/pdf':
+            return False
+
+        # 检查文件内容的PDF标识
+        file_start = file.read(5)
+        file.seek(0)  # 重置文件指针
+        if not file_start.startswith(b'%PDF-'):
+            return False
+
+        return True
+        
+    except Exception as e:
+        print(f"Error checking PDF format: {str(e)}")
+        # 发生错误时,仍然尝试通过文件头判断
+        try:
+            file_start = file.read(5)
+            file.seek(0)
+            return file_start.startswith(b'%PDF-')
+        except:
+            return False
 
 
 def url_is_pdf(file):

+ 4 - 4
setup.py

@@ -43,14 +43,14 @@ if __name__ == '__main__':
                      "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # linux 和 macos 不应限制matplotlib的最高版本,以避免无法更新导致的一些bug
                      "ultralytics>=8.3.48",  # yolov8,公式检测
                      "paddleocr==2.7.3",  # 2.8.0及2.8.1版本与detectron2有冲突,需锁定2.7.3
-                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # 解决linux的段异常问题
-                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降,需锁定2.6.1
+                     "paddlepaddle==3.0.0rc1;platform_system=='Linux' or platform_system=='Darwin'",  # 解决linux的段异常问题
+                     "paddlepaddle==2.6.1;platform_system=='Windows'",  # windows版本3.0.0效率下降,需锁定2.6.1
                      "struct-eqtable==0.3.2",  # 表格解析
                      "einops",  # struct-eqtable依赖
                      "accelerate",  # struct-eqtable依赖
                      "doclayout_yolo==0.0.2b1",  # doclayout_yolo
-                     "rapidocr-paddle",  # rapidocr-paddle
-                     "rapidocr_onnxruntime",
+                     "rapidocr-paddle>=1.4.5,<2.0.0",  # rapidocr-paddle
+                     "rapidocr_onnxruntime>=1.4.4,<2.0.0",
                      "rapid_table>=1.0.3,<2.0.0",  # rapid_table
                      "PyYAML",  # yaml
                      "openai",  # openai SDK

+ 2 - 2
tests/unittest/test_integrations/test_rag/test_utils.py

@@ -24,7 +24,7 @@ def test_convert_middle_json_to_layout_elements():
     assert len(res[0].layout_dets) > 0
     assert res[0].layout_dets[0].anno_id == 0
     assert res[0].layout_dets[0].category_type == CategoryType.text
-    assert len(res[0].extra.element_relation) >= 3
+    assert len(res[0].extra.element_relation) >= 2
 
     # teardown
     shutil.rmtree(temp_output_dir)
@@ -51,7 +51,7 @@ def test_inference():
     assert len(res[0].layout_dets) > 0
     assert res[0].layout_dets[0].anno_id == 0
     assert res[0].layout_dets[0].category_type == CategoryType.text
-    assert len(res[0].extra.element_relation) >= 3
+    assert len(res[0].extra.element_relation) >= 2
 
     # teardown
     shutil.rmtree(temp_output_dir)