před 1 rokem · d01acab4be
--- a/.github/workflows/cla.yml
+++ b/.github/workflows/cla.yml
@@ -18,7 +18,7 @@ jobs:
 
				     steps:
			
 
				       - name: "CLA Assistant"
			
 
				         if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
			
 
				-        uses: contributor-assistant/github-action@v2.4.0
			
 
				+        uses: contributor-assistant/github-action@v2.3.2
			
 
				         env:
			
 
				           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
			
 
				           # the below token should have repo scope and must be manually added by you in the repository's secret
			
@@ -26,7 +26,7 @@ jobs:
 
				           PERSONAL_ACCESS_TOKEN: ${{ secrets.RELEASE_TOKEN }}
			
 
				         with:
			
 
				           path-to-signatures: 'signatures/version1/cla.json'
			
 
				-          path-to-document: 'https://github.com/cla-assistant/github-action/blob/master/SAPCLA.md' # e.g. a CLA or a DCO document
			
 
				+          path-to-document: 'https://github.com/opendatalab/MinerU/blob/master/MinerU_CLA.md' # e.g. a CLA or a DCO document
			
 
				           # branch should not be protected
			
 
				           branch: 'main'
			
 
				           allowlist: user1,bot*
			
--- a/.github/workflows/cli.yml
+++ b/.github/workflows/cli.yml
@@ -48,11 +48,11 @@ jobs:
 
				   notify_to_feishu:
			
 
				     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
			
 
				     needs: [cli-test]
			
 
				-    runs-on: mineru
			
 
				+    runs-on: ubuntu-latest
			
 
				     steps:
			
 
				     - name: get_actor
			
 
				       run: |
			
 
				-          metion_list="quyuan"
			
 
				+          metion_list="dt-yy"
			
 
				           echo $GITHUB_ACTOR
			
 
				           if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
			
 
				             metion_list="xuchao"
			
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -31,9 +31,6 @@ jobs:
 
				       
			
 
				     - name: check-requirements
			
 
				       run: |
			
 
				-        echo $PATH
			
 
				-        conda init
			
 
				-        conda activate QA
			
 
				         source ~/.bashrc
			
 
				         pip install magic-pdf[full-cpu]==0.6.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
			
 
				         pip install https://raw.githubusercontent.com/myhloli/wheels/main/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
			
@@ -43,5 +40,32 @@ jobs:
 
				         echo $GITHUB_WORKSPACE
			
 
				         cd $GITHUB_WORKSPACE &&  pytest -s -v tests/test_cli/test_bench_gpu.py
			
 
				 
			
 
				-                                                                                                               
			
 
				+  notify_to_feishu:
			
 
				+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
			
 
				+    needs: [cli-test]
			
 
				+    runs-on: pdf
			
 
				+    steps:
			
 
				+    - name: get_actor
			
 
				+      run: |
			
 
				+          metion_list="dt-yy"
			
 
				+          echo $GITHUB_ACTOR
			
 
				+          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
			
 
				+            metion_list="xuchao"
			
 
				+          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
			
 
				+            metion_list="zhaoxiaomeng"
			
 
				+          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
			
 
				+            metion_list="xurui1"
			
 
				+          fi
			
 
				+          echo $metion_list
			
 
				+          echo "METIONS=$metion_list" >> "$GITHUB_ENV"
			
 
				+          echo ${{ env.METIONS }}
			
 
				+
			
 
				+    - name: notify
			
 
				+      run: |
			
 
				+        curl  ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json'  -d '{
			
 
				+        "msgtype": "text",
			
 
				+        "text": {
			
 
				+            "mentioned_list": ["${{ env.METIONS }}"] , "content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看：https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
			
 
				+        }
			
 
				+        }'                                                                                                                
			
 
				  
			
--- a/MinerU_CLA.md
+++ b/MinerU_CLA.md
@@ -0,0 +1,14 @@
 
				+# MinerU Contributor License Agreement
			
 
				+In order to clarify the intellectual property license granted with Contributions from any person or entity, the open source project MinerU ("MinerU") must have a Contributor License Agreement (CLA) on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of MinerU and its users; it does not change your rights to use your own Contributions for any other purpose.
			
 
				+
			
 
				+You accept and agree to the following terms and conditions for Your present and future Contributions submitted to MinerU. Except for the license granted herein to MinerU and recipients of software distributed by MinerU, You reserve all right, title, and interest in and to Your Contributions.
			
 
				+
			
 
				+1. Definitions. "You" (or "Your") shall mean the copyright owner or legal entity authorized by the copyright owner that is making this Agreement with MinerU. For legal entities, the entity making a Contribution and all other entities that control, are controlled by, or are under common control with that entity are considered to be a single Contributor. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "Contribution" shall mean the code, documentation or any original work of authorship, including any modifications or additions to an existing work, that is intentionally submitted by You to MinerU for inclusion in, or documentation of, any of the products owned or managed by MinerU (the "Work"). For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to MinerU or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, MinerU for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by You as "Not a Contribution."
			
 
				+2. Grant of Copyright License. Subject to the terms and conditions of this Agreement, You hereby grant to MinerU and to recipients of software distributed by MinerU a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense, and distribute Your Contributions and such derivative works.
			
 
				+3. Grant of Patent License. Subject to the terms and conditions of this Agreement, You hereby grant to MinerU and to recipients of software distributed by MinerU a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by You that are necessarily infringed by Your Contribution(s) alone or by combination of Your Contribution(s) with the Work to which such Contribution(s) was submitted. If any entity institutes patent litigation against You or any other entity (including a cross-claim or counterclaim in a lawsuit) alleging that Your Contribution, or the Work to which You have contributed, constitutes direct or contributory patent infringement, then any patent licenses granted to that entity under this Agreement for that Contribution or Work shall terminate as of the date such litigation is filed.
			
 
				+4. You represent that You are legally entitled to grant the above license. If You are an entity, You represent further that each of Your employee designated by You is authorized to submit Contributions on behalf of You. If You are an individual and Your employer(s) has rights to intellectual property that You create that includes Your Contributions, You represent further that You have received permission to make Contributions on behalf of that employer, that Your employer has waived such rights for Your Contributions to MinerU, or that Your employer has executed a separate CLA with MinerU.
			
 
				+5. If you do post content or submit material on MinerU and unless we indicate otherwise, you grant MinerU a nonexclusive, royalty-free, perpetual, irrevocable, and fully sublicensable right to use, reproduce, modify, adapt, publish, perform, translate, create derivative works from, distribute, and display such content throughout the world in any media. You grant MinerU and sublicensees the right to use your GitHub Public Profile, including but not limited to name, that you submit in connection with such content. You represent and warrant that you own or otherwise control all of the rights to the content that you post; that the content is accurate; that use of the content you supply does not violate this policy and will not cause injury to any person or entity; and that you will indemnify MinerU for all claims resulting from content you supply. MinerU has the right but not the obligation to monitor and edit or remove any activity or content. MinerU takes no responsibility and assumes no liability for any content posted by you or any third party.
			
 
				+6. You represent that each of Your Contributions is Your original creation. Should You wish to submit work that is not Your original creation, You may submit it to MinerU separately from any Contribution, identifying the complete details of its source and of any license or other restriction (including, but not limited to, related patents, trademarks, and license agreements) of which You are personally aware, and conspicuously marking the work as "Submitted on behalf of a third-party: [named here]".
			
 
				+7. You are not expected to provide support for Your Contributions, except to the extent You desire to provide support. You may provide support for free, for a fee, or not at all. Unless required by applicable law or agreed to in writing, You provide Your Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+8. You agree to notify MinerU of any facts or circumstances of which You become aware that would make these representations inaccurate in any respect.
			
 
				+9. MinerU reserves the right to update or change this Agreement at any time, by posting the most current version of the Agreement on MinerU, with a new Effective Date shown on Jul. 24th, 2024. All such changes in the Agreement are effective from the Effective Date. Your continued use of MinerU after we post any such changes signifies your agreement to those changes. If you do not agree to the then-current Agreement, you must immediately discontinue using MinerU.
			
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@
 
				 [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
			
 
				 [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
			
 
				 [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
			
 
				+<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
			
 
				 
			
 
				 
			
 
				 
			
@@ -64,7 +65,7 @@ Key features include:
 
				 - Available for Windows, Linux, and macOS platforms
			
 
				 
			
 
				 
			
 
				-https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
			
 
				+https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
			
 
				 
			
 
				 
			
 
				 
			
@@ -297,3 +298,8 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
 
				    <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
			
 
				  </picture>
			
 
				 </a>
			
 
				+
			
 
				+# Links
			
 
				+- [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
			
 
				+- [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
			
 
				+- [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
			
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -12,6 +12,7 @@
 
				 [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
			
 
				 [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
			
 
				 [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
			
 
				+<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
			
 
				 
			
 
				 [English](README.md) | [简体中文](README_zh-CN.md)
			
 
				 
			
@@ -58,7 +59,7 @@ Magic-PDF 是一款将 PDF 转化为 markdown 格式的工具。支持转换本
 
				 - 支持windows/linux/mac平台
			
 
				 
			
 
				 
			
 
				-https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
			
 
				+https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
			
 
				 
			
 
				 
			
 
				 
			
@@ -107,7 +108,8 @@ pip install magic-pdf[full-cpu]
 
				 > 如版本低于0.6.x，请提交issue进行反馈。
			
 
				 
			
 
				 完整功能包依赖detectron2，该库需要编译安装，如需自行编译，请参考 https://github.com/facebookresearch/detectron2/issues/5114  
			
 
				-或是直接使用我们预编译的whl包(仅限python 3.10)：  
			
 
				+或是直接使用我们预编译的whl包：
			
 
				+> ❗️预编译版本仅支持64位系统(windows/linux/macOS)+python 3.10平台；不支持任何32位系统和非mac的arm平台，如系统不支持请自行编译安装。
			
 
				 ```bash
			
 
				 pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
			
 
				 ```
			
@@ -118,11 +120,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
 
				 下载后请将models目录移动到空间较大的ssd磁盘目录  
			
 
				 
			
 
				 #### 3. 拷贝配置文件并进行配置
			
 
				-在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
			
 
				+在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
			
 
				+> ❗️务必执行以下命令将配置文件拷贝到用户目录下，否则程序将无法运行
			
 
				 ```bash
			
 
				 cp magic-pdf.template.json ~/magic-pdf.json
			
 
				 ```
			
 
				-在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
			
 
				+
			
 
				+在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
			
 
				+> ❗️务必正确配置模型权重文件所在目录，否则会因为找不到模型文件而导致程序无法运行
			
 
				 ```json
			
 
				 {
			
 
				   "models-dir": "/tmp/models"
			
@@ -291,4 +296,9 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
 
				    <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
			
 
				    <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
			
 
				  </picture>
			
 
				-</a>
			
 
				+</a>
			
 
				+
			
 
				+## 友情链接
			
 
				+- [LabelU (轻量级多模态标注工具)](https://github.com/opendatalab/labelU)
			
 
				+- [LabelLLM (开源LLM对话标注平台)](https://github.com/opendatalab/LabelLLM)
			
 
				+- [PDF-Extract-Kit (用于高质量PDF内容提取的综合工具包)](https://github.com/opendatalab/PDF-Extract-Kit)
			
--- a/demo/magic_pdf_parse_main.py
+++ b/demo/magic_pdf_parse_main.py
@@ -0,0 +1,136 @@
 
				+import os

			
 
				+import json

			
 
				+import copy

			
 
				+

			
 
				+from loguru import logger

			
 
				+

			
 
				+from magic_pdf.pipe.UNIPipe import UNIPipe

			
 
				+from magic_pdf.pipe.OCRPipe import OCRPipe

			
 
				+from magic_pdf.pipe.TXTPipe import TXTPipe

			
 
				+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

			
 
				+import magic_pdf.model as model_config

			
 
				+

			
 
				+model_config.__use_inside_model__ = True

			
 
				+

			
 
				+# todo: 设备类型选择 （？）

			
 
				+

			
 
				+def json_md_dump(

			
 
				+        pipe,

			
 
				+        md_writer,

			
 
				+        pdf_name,

			
 
				+        content_list,

			
 
				+        md_content,

			
 
				+):

			
 
				+    # 写入模型结果到 model.json

			
 
				+    orig_model_list = copy.deepcopy(pipe.model_list)

			
 
				+    md_writer.write(

			
 
				+        content=json.dumps(orig_model_list, ensure_ascii=False, indent=4),

			
 
				+        path=f"{pdf_name}_model.json"

			
 
				+    )

			
 
				+

			
 
				+    # 写入中间结果到 middle.json

			
 
				+    md_writer.write(

			
 
				+        content=json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),

			
 
				+        path=f"{pdf_name}_middle.json"

			
 
				+    )

			
 
				+

			
 
				+    # text文本结果写入到 conent_list.json

			
 
				+    md_writer.write(

			
 
				+        content=json.dumps(content_list, ensure_ascii=False, indent=4),

			
 
				+        path=f"{pdf_name}_content_list.json"

			
 
				+    )

			
 
				+

			
 
				+    # 写入结果到 .md 文件中

			
 
				+    md_writer.write(

			
 
				+        content=md_content,

			
 
				+        path=f"{pdf_name}.md"

			
 
				+    )

			
 
				+

			
 
				+

			
 
				+def pdf_parse_main(

			
 
				+        pdf_path: str,

			
 
				+        parse_method: str = 'auto',

			
 
				+        model_json_path: str = None,

			
 
				+        is_json_md_dump: bool = True,

			
 
				+        output_dir: str = None

			
 
				+):

			
 
				+    """

			
 
				+    执行从 pdf 转换到 json、md 的过程，输出 md 和 json 文件到 pdf 文件所在的目录

			
 
				+

			
 
				+    :param pdf_path: .pdf 文件的路径，可以是相对路径，也可以是绝对路径

			
 
				+    :param parse_method: 解析方法， 共 auto、ocr、txt 三种，默认 auto，如果效果不好，可以尝试 ocr

			
 
				+    :param model_json_path: 已经存在的模型数据文件，如果为空则使用内置模型，pdf 和 model_json 务必对应

			
 
				+    :param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中，默认 True，会将不同阶段的数据写入到不同的 .json 文件中（共3个.json文件），md内容会保存到 .md 文件中

			
 
				+    :param output_dir: 输出结果的目录地址，会生成一个以 pdf 文件名命名的文件夹并保存所有结果

			
 
				+    """

			
 
				+    try:

			
 
				+        pdf_name = os.path.basename(pdf_path).split(".")[0]

			
 
				+        pdf_path_parent = os.path.dirname(pdf_path)

			
 
				+

			
 
				+        if output_dir:

			
 
				+            output_path = os.path.join(output_dir, pdf_name)

			
 
				+        else:

			
 
				+            output_path = os.path.join(pdf_path_parent, pdf_name)

			
 
				+

			
 
				+        output_image_path = os.path.join(output_path, 'images')

			
 
				+

			
 
				+        # 获取图片的父路径，为的是以相对路径保存到 .md 和 conent_list.json 文件中

			
 
				+        image_path_parent = os.path.basename(output_image_path)

			
 
				+

			
 
				+        pdf_bytes = open(pdf_path, "rb").read()  # 读取 pdf 文件的二进制数据

			
 
				+

			
 
				+        if model_json_path:

			
 
				+            # 读取已经被模型解析后的pdf文件的 json 原始数据，list 类型

			
 
				+            model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())

			
 
				+        else:

			
 
				+            model_json = []

			
 
				+

			
 
				+        # 执行解析步骤

			
 
				+        # image_writer = DiskReaderWriter(output_image_path)

			
 
				+        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

			
 
				+

			
 
				+        # 选择解析方式

			
 
				+        # jso_useful_key = {"_pdf_type": "", "model_list": model_json}

			
 
				+        # pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)

			
 
				+        if parse_method == "auto":

			
 
				+            jso_useful_key = {"_pdf_type": "", "model_list": model_json}

			
 
				+            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)

			
 
				+        elif parse_method == "txt":

			
 
				+            pipe = TXTPipe(pdf_bytes, model_json, image_writer)

			
 
				+        elif parse_method == "ocr":

			
 
				+            pipe = OCRPipe(pdf_bytes, model_json, image_writer)

			
 
				+        else:

			
 
				+            logger.error("unknown parse method, only auto, ocr, txt allowed")

			
 
				+            exit(1)

			
 
				+

			
 
				+        # 执行分类

			
 
				+        pipe.pipe_classify()

			
 
				+

			
 
				+        # 如果没有传入模型数据，则使用内置模型解析

			
 
				+        if not model_json:

			
 
				+            if model_config.__use_inside_model__:

			
 
				+                pipe.pipe_analyze()  # 解析

			
 
				+            else:

			
 
				+                logger.error("need model list input")

			
 
				+                exit(1)

			
 
				+

			
 
				+        # 执行解析

			
 
				+        pipe.pipe_parse()

			
 
				+

			
 
				+        # 保存 text 和 md 格式的结果

			
 
				+        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")

			
 
				+        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

			
 
				+

			
 
				+

			
 
				+        if is_json_md_dump:

			
 
				+            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

			
 
				+

			
 
				+

			
 
				+    except Exception as e:

			
 
				+        logger.exception(e)

			
 
				+

			
 
				+

			
 
				+# 测试

			
 
				+if __name__ == '__main__':

			
 
				+    pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"

			
 
				+    pdf_parse_main(pdf_path)

			
--- a/docs/FAQ_zh_cn.md
+++ b/docs/FAQ_zh_cn.md
@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
 
				 pip install magic-pdf
			
 
				 pip install unimernet==0.1.0
			
 
				 pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
			
 
				+pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ 
			
 
				 ```
			
 
				 
			
 
				 ### 4.在部分较新的M芯片macOS设备上，MPS加速开启失败
			
@@ -82,4 +83,19 @@ pip install paddlepaddle-gpu
 
				 model_json 指的是通过模型分析后生成的一种有特定格式的json文件。  
			
 
				 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成，该文件一般在项目的output目录下。  
			
 
				 如果使用 MinerU 的命令行调用内置的模型分析，该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。  
			
 
				-参考：https://github.com/opendatalab/MinerU/issues/128
			
 
				+参考：https://github.com/opendatalab/MinerU/issues/128
			
 
				+
			
 
				+### 10.Linux下报错：Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
			
 
				+
			
 
				+这种情况可以先使用pip list 检查一下自己的依赖库列表，重点确认下以下几个库有没有安装（版本不一定完全一致，有就可以）
			
 
				+```bash
			
 
				+opencv-contrib-python     4.6.0.66
			
 
				+opencv-python             4.6.0.66
			
 
				+opencv-python-headless    4.10.0.84
			
 
				+paddleocr                 2.7.3
			
 
				+paddlepaddle              2.6.1
			
 
				+torch                     2.2.2
			
 
				+torchtext                 0.17.2
			
 
				+torchvision               0.17.2
			
 
				+```
			
 
				+如果都有的话，可能是libgl库没有安装，参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装libgl库后再试试能不能正常使用。
			
--- a/magic_pdf/cli/magicpdf.py
+++ b/magic_pdf/cli/magicpdf.py
@@ -89,7 +89,6 @@ def do_parse(
 
				     orig_model_list = copy.deepcopy(model_list)
			
 
				 
			
 
				     local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
			
 
				-    logger.info(f"local output dir is {local_md_dir}")
			
 
				     image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
			
 
				     image_dir = str(os.path.basename(local_image_dir))
			
 
				 
			
@@ -163,6 +162,7 @@ def do_parse(
 
				             path=f"{pdf_file_name}_content_list.json",
			
 
				             mode=AbsReaderWriter.MODE_TXT,
			
 
				         )
			
 
				+    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
			
 
				 
			
 
				 
			
 
				 @click.group()
			
--- a/magic_pdf/libs/__pycache__/__init__.cpython-312.pyc
+++ b/magic_pdf/libs/__pycache__/__init__.cpython-312.pyc
--- a/magic_pdf/libs/__pycache__/version.cpython-312.pyc
+++ b/magic_pdf/libs/__pycache__/version.cpython-312.pyc
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
@@ -10,16 +10,19 @@ from loguru import logger
 
				 
			
 
				 from magic_pdf.libs.commons import parse_bucket_key
			
 
				 
			
 
				+# 定义配置文件名常量
			
 
				+CONFIG_FILE_NAME = "magic-pdf.json"
			
 
				+
			
 
				 
			
 
				 def read_config():
			
 
				     home_dir = os.path.expanduser("~")
			
 
				 
			
 
				-    config_file = os.path.join(home_dir, "magic-pdf.json")
			
 
				+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
			
 
				 
			
 
				     if not os.path.exists(config_file):
			
 
				-        raise Exception(f"{config_file} not found")
			
 
				+        raise FileNotFoundError(f"{config_file} not found")
			
 
				 
			
 
				-    with open(config_file, "r") as f:
			
 
				+    with open(config_file, "r", encoding="utf-8") as f:
			
 
				         config = json.load(f)
			
 
				     return config
			
 
				 
			
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
 
				         access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
			
 
				 
			
 
				     if access_key is None or secret_key is None or storage_endpoint is None:
			
 
				-        raise Exception("ak, sk or endpoint not found in magic-pdf.json")
			
 
				+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
			
 
				 
			
 
				     # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
			
 
				 
			
@@ -56,17 +59,32 @@ def get_bucket_name(path):
 
				 
			
 
				 def get_local_dir():
			
 
				     config = read_config()
			
 
				-    return config.get("temp-output-dir", "/tmp")
			
 
				+    local_dir = config.get("temp-output-dir")
			
 
				+    if local_dir is None:
			
 
				+        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
			
 
				+        return "/tmp"
			
 
				+    else:
			
 
				+        return local_dir
			
 
				 
			
 
				 
			
 
				 def get_local_models_dir():
			
 
				     config = read_config()
			
 
				-    return config.get("models-dir", "/tmp/models")
			
 
				+    models_dir = config.get("models-dir")
			
 
				+    if models_dir is None:
			
 
				+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
			
 
				+        return "/tmp/models"
			
 
				+    else:
			
 
				+        return models_dir
			
 
				 
			
 
				 
			
 
				 def get_device():
			
 
				     config = read_config()
			
 
				-    return config.get("device-mode", "cpu")
			
 
				+    device = config.get("device-mode")
			
 
				+    if device is None:
			
 
				+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
			
 
				+        return "cpu"
			
 
				+    else:
			
 
				+        return device
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -1,8 +1,19 @@
 
				+import os
			
 
				 import unicodedata
			
 
				+
			
 
				+if not os.getenv("FTLANG_CACHE"):
			
 
				+    current_file_path = os.path.abspath(__file__)
			
 
				+    current_dir = os.path.dirname(current_file_path)
			
 
				+    root_dir = os.path.dirname(current_dir)
			
 
				+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
			
 
				+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
			
 
				+    # print(os.getenv("FTLANG_CACHE"))
			
 
				+
			
 
				 from fast_langdetect import detect_language
			
 
				 
			
 
				 
			
 
				 def detect_lang(text: str) -> str:
			
 
				+
			
 
				     if len(text) == 0:
			
 
				         return ""
			
 
				     try:
			
@@ -18,6 +29,7 @@ def detect_lang(text: str) -> str:
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				+    print(os.getenv("FTLANG_CACHE"))
			
 
				     print(detect_lang("This is a test."))
			
 
				     print(detect_lang("<html>This is a test</html>"))
			
 
				     print(detect_lang("这个是中文测试。"))
			
--- a/magic_pdf/libs/local_math.py
+++ b/magic_pdf/libs/local_math.py
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
@@ -9,7 +9,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
				 from magic_pdf.libs.ocr_content_type import ContentType
			
 
				 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
			
 
				 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
			
 
				-from magic_pdf.libs.math import float_gt
			
 
				+from magic_pdf.libs.local_math import float_gt
			
 
				 from magic_pdf.libs.boxbase import (
			
 
				     _is_in,
			
 
				     bbox_relative_pos,
			
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -1,6 +1,8 @@
 
				 from loguru import logger
			
 
				 import os
			
 
				 import time
			
 
				+
			
 
				+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
			
 
				 try:
			
 
				     import cv2
			
 
				     import yaml
			
@@ -17,14 +19,17 @@ try:
 
				     import unimernet.tasks as tasks
			
 
				     from unimernet.processors import load_processor
			
 
				 
			
 
				-    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
			
 
				-    from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
			
 
				-    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
			
 
				 except ImportError as e:
			
 
				     logger.exception(e)
			
 
				-    logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
			
 
				+    logger.error(
			
 
				+        'Required dependency not installed, please install by \n'
			
 
				+        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
			
 
				     exit(1)
			
 
				 
			
 
				+from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
			
 
				+from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
			
 
				+from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
			
 
				+
			
 
				 
			
 
				 def mfd_model_init(weight):
			
 
				     mfd_model = YOLO(weight)
			
@@ -84,7 +89,7 @@ class CustomPEKModel:
 
				         model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
			
 
				         # 构建 model_configs.yaml 文件的完整路径
			
 
				         config_path = os.path.join(model_config_dir, 'model_configs.yaml')
			
 
				-        with open(config_path, "r") as f:
			
 
				+        with open(config_path, "r", encoding='utf-8') as f:
			
 
				             self.configs = yaml.load(f, Loader=yaml.FullLoader)
			
 
				         # 初始化解析配置
			
 
				         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
			
@@ -100,6 +105,7 @@ class CustomPEKModel:
 
				         self.device = kwargs.get("device", self.configs["config"]["device"])
			
 
				         logger.info("using device: {}".format(self.device))
			
 
				         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
			
 
				+        logger.info("using models_dir: {}".format(models_dir))
			
 
				 
			
 
				         # 初始化公式识别
			
 
				         if self.apply_formula:
			
@@ -135,66 +141,110 @@ class CustomPEKModel:
 
				         layout_cost = round(time.time() - layout_start, 2)
			
 
				         logger.info(f"layout detection cost: {layout_cost}")
			
 
				 
			
 
				-        # 公式检测
			
 
				-        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
			
 
				-        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
			
 
				-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
			
 
				-            new_item = {
			
 
				-                'category_id': 13 + int(cla.item()),
			
 
				-                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
			
 
				-                'score': round(float(conf.item()), 2),
			
 
				-                'latex': '',
			
 
				-            }
			
 
				-            layout_res.append(new_item)
			
 
				-            latex_filling_list.append(new_item)
			
 
				-            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
			
 
				-            mf_image_list.append(bbox_img)
			
 
				-
			
 
				-        # 公式识别
			
 
				-        mfr_start = time.time()
			
 
				-        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
			
 
				-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
			
 
				-        mfr_res = []
			
 
				-        for mf_img in dataloader:
			
 
				-            mf_img = mf_img.to(self.device)
			
 
				-            output = self.mfr_model.generate({'image': mf_img})
			
 
				-            mfr_res.extend(output['pred_str'])
			
 
				-        for res, latex in zip(latex_filling_list, mfr_res):
			
 
				-            res['latex'] = latex_rm_whitespace(latex)
			
 
				-        mfr_cost = round(time.time() - mfr_start, 2)
			
 
				-        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
			
 
				+        if self.apply_formula:
			
 
				+            # 公式检测
			
 
				+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
			
 
				+            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
			
 
				+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
			
 
				+                new_item = {
			
 
				+                    'category_id': 13 + int(cla.item()),
			
 
				+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
			
 
				+                    'score': round(float(conf.item()), 2),
			
 
				+                    'latex': '',
			
 
				+                }
			
 
				+                layout_res.append(new_item)
			
 
				+                latex_filling_list.append(new_item)
			
 
				+                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
			
 
				+                mf_image_list.append(bbox_img)
			
 
				+
			
 
				+            # 公式识别
			
 
				+            mfr_start = time.time()
			
 
				+            dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
			
 
				+            dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
			
 
				+            mfr_res = []
			
 
				+            for mf_img in dataloader:
			
 
				+                mf_img = mf_img.to(self.device)
			
 
				+                output = self.mfr_model.generate({'image': mf_img})
			
 
				+                mfr_res.extend(output['pred_str'])
			
 
				+            for res, latex in zip(latex_filling_list, mfr_res):
			
 
				+                res['latex'] = latex_rm_whitespace(latex)
			
 
				+            mfr_cost = round(time.time() - mfr_start, 2)
			
 
				+            logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
			
 
				 
			
 
				         # ocr识别
			
 
				         if self.apply_ocr:
			
 
				             ocr_start = time.time()
			
 
				             pil_img = Image.fromarray(image)
			
 
				+
			
 
				+            # 筛选出需要OCR的区域和公式区域
			
 
				+            ocr_res_list = []
			
 
				             single_page_mfdetrec_res = []
			
 
				             for res in layout_res:
			
 
				                 if int(res['category_id']) in [13, 14]:
			
 
				-                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
			
 
				-                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
			
 
				                     single_page_mfdetrec_res.append({
			
 
				-                        "bbox": [xmin, ymin, xmax, ymax],
			
 
				+                        "bbox": [int(res['poly'][0]), int(res['poly'][1]),
			
 
				+                                 int(res['poly'][4]), int(res['poly'][5])],
			
 
				                     })
			
 
				-            for res in layout_res:
			
 
				-                if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # 需要进行ocr的类别
			
 
				-                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
			
 
				-                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
			
 
				-                    crop_box = (xmin, ymin, xmax, ymax)
			
 
				-                    cropped_img = Image.new('RGB', pil_img.size, 'white')
			
 
				-                    cropped_img.paste(pil_img.crop(crop_box), crop_box)
			
 
				-                    cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
			
 
				-                    ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
			
 
				-                    if ocr_res:
			
 
				-                        for box_ocr_res in ocr_res:
			
 
				-                            p1, p2, p3, p4 = box_ocr_res[0]
			
 
				-                            text, score = box_ocr_res[1]
			
 
				-                            layout_res.append({
			
 
				-                                'category_id': 15,
			
 
				-                                'poly': p1 + p2 + p3 + p4,
			
 
				-                                'score': round(score, 2),
			
 
				-                                'text': text,
			
 
				-                            })
			
 
				+                elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
			
 
				+                    ocr_res_list.append(res)
			
 
				+
			
 
				+            # 对每一个需OCR处理的区域进行处理
			
 
				+            for res in ocr_res_list:
			
 
				+                xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
			
 
				+                xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
			
 
				+
			
 
				+                paste_x = 50
			
 
				+                paste_y = 50
			
 
				+                # 创建一个宽高各多50的白色背景
			
 
				+                new_width = xmax - xmin + paste_x * 2
			
 
				+                new_height = ymax - ymin + paste_y * 2
			
 
				+                new_image = Image.new('RGB', (new_width, new_height), 'white')
			
 
				+
			
 
				+                # 裁剪图像
			
 
				+                crop_box = (xmin, ymin, xmax, ymax)
			
 
				+                cropped_img = pil_img.crop(crop_box)
			
 
				+                new_image.paste(cropped_img, (paste_x, paste_y))
			
 
				+
			
 
				+                # 调整公式区域坐标
			
 
				+                adjusted_mfdetrec_res = []
			
 
				+                for mf_res in single_page_mfdetrec_res:
			
 
				+                    mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]
			
 
				+                    # 将公式区域坐标调整为相对于裁剪区域的坐标
			
 
				+                    x0 = mf_xmin - xmin + paste_x
			
 
				+                    y0 = mf_ymin - ymin + paste_y
			
 
				+                    x1 = mf_xmax - xmin + paste_x
			
 
				+                    y1 = mf_ymax - ymin + paste_y
			
 
				+                    # 过滤在图外的公式块
			
 
				+                    if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]):
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        adjusted_mfdetrec_res.append({
			
 
				+                            "bbox": [x0, y0, x1, y1],
			
 
				+                        })
			
 
				+
			
 
				+                # OCR识别
			
 
				+                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
			
 
				+                ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
			
 
				+
			
 
				+                # 整合结果
			
 
				+                if ocr_res:
			
 
				+                    for box_ocr_res in ocr_res:
			
 
				+                        p1, p2, p3, p4 = box_ocr_res[0]
			
 
				+                        text, score = box_ocr_res[1]
			
 
				+
			
 
				+                        # 将坐标转换回原图坐标系
			
 
				+                        p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin]
			
 
				+                        p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin]
			
 
				+                        p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin]
			
 
				+                        p4 = [p4[0] - paste_x + xmin, p4[1] - paste_y + ymin]
			
 
				+
			
 
				+                        layout_res.append({
			
 
				+                            'category_id': 15,
			
 
				+                            'poly': p1 + p2 + p3 + p4,
			
 
				+                            'score': round(score, 2),
			
 
				+                            'text': text,
			
 
				+                        })
			
 
				+
			
 
				             ocr_cost = round(time.time() - ocr_start, 2)
			
 
				             logger.info(f"ocr cost: {ocr_cost}")
			
 
				 
			
--- a/magic_pdf/model/pek_sub_modules/self_modify.py
+++ b/magic_pdf/model/pek_sub_modules/self_modify.py
@@ -10,12 +10,17 @@ from paddleocr import PaddleOCR
 
				 from paddleocr.ppocr.utils.logging import get_logger
			
 
				 from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
			
 
				 from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
			
 
				+
			
 
				+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
			
 
				+
			
 
				 logger = get_logger()
			
 
				 
			
 
				+
			
 
				 def img_decode(content: bytes):
			
 
				     np_arr = np.frombuffer(content, dtype=np.uint8)
			
 
				     return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
			
 
				 
			
 
				+
			
 
				 def check_img(img):
			
 
				     if isinstance(img, bytes):
			
 
				         img = img_decode(img)
			
@@ -51,6 +56,7 @@ def check_img(img):
 
				 
			
 
				     return img
			
 
				 
			
 
				+
			
 
				 def sorted_boxes(dt_boxes):
			
 
				     """
			
 
				     Sort text boxes in order from top to bottom, left to right
			
@@ -75,49 +81,87 @@ def sorted_boxes(dt_boxes):
 
				     return _boxes
			
 
				 
			
 
				 
			
 
				-def formula_in_text(mf_bbox, text_bbox):
			
 
				-    x1, y1, x2, y2 = mf_bbox
			
 
				-    x3, y3 = text_bbox[0]
			
 
				-    x4, y4 = text_bbox[2]
			
 
				-    left_box, right_box = None, None
			
 
				-    same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2
			
 
				-    if not same_line:
			
 
				-        return False, left_box, right_box
			
 
				-    else:
			
 
				-        drop_origin = False
			
 
				-        left_x = x1 - 1
			
 
				-        right_x = x2 + 1
			
 
				-        if x3 < x1 and x2 < x4:
			
 
				-            drop_origin = True
			
 
				-            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
			
 
				-            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
			
 
				-        if x3 < x1 and x1 <= x4 <= x2:
			
 
				-            drop_origin = True
			
 
				-            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
			
 
				-        if x1 <= x3 <= x2 and x2 < x4:
			
 
				-            drop_origin = True
			
 
				-            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
			
 
				-        if x1 <= x3 < x4 <= x2:
			
 
				-            drop_origin = True
			
 
				-        return drop_origin, left_box, right_box
			
 
				-
			
 
				-    
			
 
				-def update_det_boxes(dt_boxes, mfdetrec_res):
			
 
				-    new_dt_boxes = dt_boxes
			
 
				-    for mf_box in mfdetrec_res:
			
 
				-        flag, left_box, right_box = False, None, None
			
 
				-        for idx, text_box in enumerate(new_dt_boxes):
			
 
				-            ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
			
 
				-            if ret:
			
 
				-                new_dt_boxes.pop(idx)
			
 
				-                if left_box is not None:
			
 
				-                    new_dt_boxes.append(left_box)
			
 
				-                if right_box is not None:
			
 
				-                    new_dt_boxes.append(right_box)
			
 
				-                break
			
 
				-            
			
 
				+def bbox_to_points(bbox):
			
 
				+    """ 将bbox格式转换为四个顶点的数组 """
			
 
				+    x0, y0, x1, y1 = bbox
			
 
				+    return np.array([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]).astype('float32')
			
 
				+
			
 
				+
			
 
				+def points_to_bbox(points):
			
 
				+    """ 将四个顶点的数组转换为bbox格式 """
			
 
				+    x0, y0 = points[0]
			
 
				+    x1, _ = points[1]
			
 
				+    _, y1 = points[2]
			
 
				+    return [x0, y0, x1, y1]
			
 
				+
			
 
				+
			
 
				+def merge_intervals(intervals):
			
 
				+    # Sort the intervals based on the start value
			
 
				+    intervals.sort(key=lambda x: x[0])
			
 
				+
			
 
				+    merged = []
			
 
				+    for interval in intervals:
			
 
				+        # If the list of merged intervals is empty or if the current
			
 
				+        # interval does not overlap with the previous, simply append it.
			
 
				+        if not merged or merged[-1][1] < interval[0]:
			
 
				+            merged.append(interval)
			
 
				+        else:
			
 
				+            # Otherwise, there is overlap, so we merge the current and previous intervals.
			
 
				+            merged[-1][1] = max(merged[-1][1], interval[1])
			
 
				+
			
 
				+    return merged
			
 
				+
			
 
				+
			
 
				+def remove_intervals(original, masks):
			
 
				+    # Merge all mask intervals
			
 
				+    merged_masks = merge_intervals(masks)
			
 
				+
			
 
				+    result = []
			
 
				+    original_start, original_end = original
			
 
				+
			
 
				+    for mask in merged_masks:
			
 
				+        mask_start, mask_end = mask
			
 
				+
			
 
				+        # If the mask starts after the original range, ignore it
			
 
				+        if mask_start > original_end:
			
 
				+            continue
			
 
				+
			
 
				+        # If the mask ends before the original range starts, ignore it
			
 
				+        if mask_end < original_start:
			
 
				+            continue
			
 
				+
			
 
				+        # Remove the masked part from the original range
			
 
				+        if original_start < mask_start:
			
 
				+            result.append([original_start, mask_start - 1])
			
 
				+
			
 
				+        original_start = max(mask_end + 1, original_start)
			
 
				+
			
 
				+    # Add the remaining part of the original range, if any
			
 
				+    if original_start <= original_end:
			
 
				+        result.append([original_start, original_end])
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def update_det_boxes(dt_boxes, mfd_res):
			
 
				+    new_dt_boxes = []
			
 
				+    for text_box in dt_boxes:
			
 
				+        text_bbox = points_to_bbox(text_box)
			
 
				+        masks_list = []
			
 
				+        for mf_box in mfd_res:
			
 
				+            mf_bbox = mf_box['bbox']
			
 
				+            if __is_overlaps_y_exceeds_threshold(text_bbox, mf_bbox):
			
 
				+                masks_list.append([mf_bbox[0], mf_bbox[2]])
			
 
				+        text_x_range = [text_bbox[0], text_bbox[2]]
			
 
				+        text_remove_mask_range = remove_intervals(text_x_range, masks_list)
			
 
				+        temp_dt_box = []
			
 
				+        for text_remove_mask in text_remove_mask_range:
			
 
				+            temp_dt_box.append(bbox_to_points([text_remove_mask[0], text_bbox[1], text_remove_mask[1], text_bbox[3]]))
			
 
				+        if len(temp_dt_box) > 0:
			
 
				+            new_dt_boxes.extend(temp_dt_box)
			
 
				     return new_dt_boxes
			
 
				 
			
 
				+
			
 
				 class ModifiedPaddleOCR(PaddleOCR):
			
 
				     def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
			
 
				         """
			
@@ -197,7 +241,7 @@ class ModifiedPaddleOCR(PaddleOCR):
 
				             if not rec:
			
 
				                 return cls_res
			
 
				             return ocr_res
			
 
				-        
			
 
				+
			
 
				     def __call__(self, img, cls=True, mfd_res=None):
			
 
				         time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
			
 
				 
			
@@ -226,7 +270,7 @@ class ModifiedPaddleOCR(PaddleOCR):
 
				             dt_boxes = update_det_boxes(dt_boxes, mfd_res)
			
 
				             aft = time.time()
			
 
				             logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
			
 
				-                len(dt_boxes), aft-bef))
			
 
				+                len(dt_boxes), aft - bef))
			
 
				 
			
 
				         for bno in range(len(dt_boxes)):
			
 
				             tmp_box = copy.deepcopy(dt_boxes[bno])
			
--- a/magic_pdf/model/pp_structure_v2.py
+++ b/magic_pdf/model/pp_structure_v2.py
@@ -5,7 +5,7 @@ from loguru import logger
 
				 try:
			
 
				     from paddleocr import PPStructure
			
 
				 except ImportError:
			
 
				-    logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
			
 
				+    logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"')
			
 
				     exit(1)
			
 
				 
			
 
				 
			
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
@@ -7,7 +7,7 @@ from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_c
 
				 from magic_pdf.libs.convert_utils import dict_to_list
			
 
				 from magic_pdf.libs.drop_reason import DropReason
			
 
				 from magic_pdf.libs.hash_utils import compute_md5
			
 
				-from magic_pdf.libs.math import float_equal
			
 
				+from magic_pdf.libs.local_math import float_equal
			
 
				 from magic_pdf.libs.ocr_content_type import ContentType
			
 
				 from magic_pdf.model.magic_model import MagicModel
			
 
				 from magic_pdf.para.para_split_v2 import para_split
			
@@ -111,7 +111,8 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
				     spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
			
 
				 
			
 
				     '''将所有区块的bbox整理到一起'''
			
 
				-    # @todo interline_equation_blocks参数不够准，后面切换到interline_equations上
			
 
				+    # interline_equation_blocks参数不够准，后面切换到interline_equations上
			
 
				+    interline_equation_blocks = []
			
 
				     if len(interline_equation_blocks) > 0:
			
 
				         all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
			
 
				             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
			
@@ -120,6 +121,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
				         all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
			
 
				             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
			
 
				             interline_equations, page_w, page_h)
			
 
				+
			
 
				     if len(drop_reasons) > 0:
			
 
				         need_drop = True
			
 
				         drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
			
--- a/magic_pdf/pre_proc/citationmarker_remove.py
+++ b/magic_pdf/pre_proc/citationmarker_remove.py
@@ -135,7 +135,11 @@ def remove_citation_marker(with_char_text_blcoks):
 
				                 
			
 
				                 if max_font_sz-span_font_sz<1: # 先以字体过滤正文，如果是正文就不再继续判断了
			
 
				                     continue
			
 
				-                
			
 
				+
			
 
				+                # 对被除数为0的情况进行过滤
			
 
				+                if span_hi==0 or min_font_sz==0:
			
 
				+                    continue
			
 
				+
			
 
				                 if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
			
 
				                     """
			
 
				                     1. 它的前一个char如果是句号或者逗号的话，那么肯定是角标而不是公式
			
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -36,9 +36,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
 
				     all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
			
 
				     '''任何框体与舍弃框重叠，优先信任舍弃框'''
			
 
				     all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
			
 
				-    # @todo interline_equation 与title或text框冲突的情况，分两种情况处理
			
 
				+
			
 
				+    # interline_equation 与title或text框冲突的情况，分两种情况处理
			
 
				     '''interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框'''
			
 
				+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
			
 
				     '''interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框'''
			
 
				+    # 通过后续大框套小框逻辑删除
			
 
				 
			
 
				     '''discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）'''
			
 
				     for discarded in discarded_blocks:
			
@@ -57,6 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
 
				     return all_bboxes, all_discarded_blocks, drop_reasons
			
 
				 
			
 
				 
			
 
				+def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
			
 
				+    # 先提取所有text和interline block
			
 
				+    text_blocks = []
			
 
				+    for block in all_bboxes:
			
 
				+        if block[7] == BlockType.Text:
			
 
				+            text_blocks.append(block)
			
 
				+    interline_equation_blocks = []
			
 
				+    for block in all_bboxes:
			
 
				+        if block[7] == BlockType.InterlineEquation:
			
 
				+            interline_equation_blocks.append(block)
			
 
				+
			
 
				+    need_remove = []
			
 
				+
			
 
				+    for interline_equation_block in interline_equation_blocks:
			
 
				+        for text_block in text_blocks:
			
 
				+            interline_equation_block_bbox = interline_equation_block[:4]
			
 
				+            text_block_bbox = text_block[:4]
			
 
				+            if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
			
 
				+                if text_block not in need_remove:
			
 
				+                    need_remove.append(text_block)
			
 
				+
			
 
				+    if len(need_remove) > 0:
			
 
				+        for block in need_remove:
			
 
				+            all_bboxes.remove(block)
			
 
				+
			
 
				+    return all_bboxes
			
 
				+
			
 
				+
			
 
				 def fix_text_overlap_title_blocks(all_bboxes):
			
 
				     # 先提取所有text和title block
			
 
				     text_blocks = []
			
@@ -68,12 +99,19 @@ def fix_text_overlap_title_blocks(all_bboxes):
 
				         if block[7] == BlockType.Title:
			
 
				             title_blocks.append(block)
			
 
				 
			
 
				+    need_remove = []
			
 
				+
			
 
				     for text_block in text_blocks:
			
 
				         for title_block in title_blocks:
			
 
				             text_block_bbox = text_block[:4]
			
 
				             title_block_bbox = title_block[:4]
			
 
				             if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
			
 
				-                all_bboxes.remove(title_block)
			
 
				+                if title_block not in need_remove:
			
 
				+                    need_remove.append(title_block)
			
 
				+
			
 
				+    if len(need_remove) > 0:
			
 
				+        for block in need_remove:
			
 
				+            all_bboxes.remove(block)
			
 
				 
			
 
				     return all_bboxes
			
 
				 
			
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g
 
				 from magic_pdf.libs.drop_tag import DropTag
			
 
				 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
			
 
				 
			
 
				+
			
 
				 def remove_overlaps_low_confidence_spans(spans):
			
 
				     dropped_spans = []
			
 
				     #  删除重叠spans中置信度低的的那些
			
 
				     for span1 in spans:
			
 
				         for span2 in spans:
			
 
				             if span1 != span2:
			
 
				-                if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
			
 
				-                    if span1['score'] < span2['score']:
			
 
				-                        span_need_remove = span1
			
 
				-                    else:
			
 
				-                        span_need_remove = span2
			
 
				-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
			
 
				-                        dropped_spans.append(span_need_remove)
			
 
				+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
			
 
				+                if span1 in dropped_spans or span2 in dropped_spans:
			
 
				+                    continue
			
 
				+                else:
			
 
				+                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
			
 
				+                        if span1['score'] < span2['score']:
			
 
				+                            span_need_remove = span1
			
 
				+                        else:
			
 
				+                            span_need_remove = span2
			
 
				+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
			
 
				+                            dropped_spans.append(span_need_remove)
			
 
				 
			
 
				     if len(dropped_spans) > 0:
			
 
				         for span_need_remove in dropped_spans:
			
--- a/magic_pdf/resources/fasttext-langdetect/lid.176.ftz
+++ b/magic_pdf/resources/fasttext-langdetect/lid.176.ftz
--- a/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
+++ b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
@@ -1,6 +1,6 @@
 
				 AUG:
			
 
				   DETR: true
			
 
				-CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface
			
 
				+CACHE_DIR: ~/cache/huggingface
			
 
				 CUDNN_BENCHMARK: false
			
 
				 DATALOADER:
			
 
				   ASPECT_RATIO_GROUPING: true
			
@@ -294,7 +294,7 @@ MODEL:
 
				     POS_TYPE: abs
			
 
				   WEIGHTS: 
			
 
				 OUTPUT_DIR: 
			
 
				-SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train
			
 
				+SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
			
 
				 SEED: 42
			
 
				 SOLVER:
			
 
				   AMP:
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ numpy>=1.21.6
 
				 fast-langdetect>=0.2.1
			
 
				 wordninja>=2.0.0
			
 
				 scikit-learn>=1.0.2
			
 
				-pdfminer.six>=20231228
			
 
				+pdfminer.six==20231228
			
 
				 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
			
--- a/setup.py
+++ b/setup.py
@@ -32,9 +32,8 @@ if __name__ == '__main__':
 
				         },
			
 
				         install_requires=parse_requirements('requirements.txt'),  # 项目依赖的第三方库
			
 
				         extras_require={
			
 
				-            "gpu": ["paddleocr==2.7.3", "paddlepaddle-gpu"],
			
 
				-            "cpu": ["paddleocr==2.7.3", "paddlepaddle"],
			
 
				-            "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle"],
			
 
				+            "lite": ["paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
			
 
				+            "full": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
			
 
				         },
			
 
				         description="A practical tool for converting PDF to Markdown",  # 简短描述
			
 
				         long_description=long_description,  # 详细描述
			
--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -0,0 +1,4 @@
 
				+{
			
 
				+   "signedContributors": [
			
 
				+   ]
			
 
				+}
			
--- a/tests/__pycache__/test_unit.cpython-39-pytest-7.4.0.pyc
+++ b/tests/__pycache__/test_unit.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/__pycache__/test_bench.cpython-39-pytest-7.4.0.pyc
+++ b/tests/test_cli/__pycache__/test_bench.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/__pycache__/test_cli.cpython-39-pytest-7.4.0.pyc
+++ b/tests/test_cli/__pycache__/test_cli.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/conf/__pycache__/conf.cpython-39.pyc
+++ b/tests/test_cli/conf/__pycache__/conf.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/__init__.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/__init__.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/calculate_score.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/calculate_score.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/scoring.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/scoring.cpython-39.pyc
--- a/tests/test_cli/test_bench_gpu.py
+++ b/tests/test_cli/test_bench_gpu.py
@@ -6,7 +6,7 @@ import json
 
				 from magic_pdf.pipe.UNIPipe import UNIPipe
			
 
				 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
			
 
				 from lib import calculate_score
			
 
				-
			
 
				+import shutil
			
 
				 pdf_res_path = conf.conf["pdf_res_path"]
			
 
				 code_path = conf.conf["code_path"]
			
 
				 pdf_dev_path = conf.conf["pdf_dev_path"]
			
@@ -58,8 +58,8 @@ def pdf_to_markdown():
 
				         if not os.path.exists(dir_path):
			
 
				             os.makedirs(dir_path, exist_ok=True)
			
 
				         res_path = os.path.join(dir_path, f"{demo_name}.md")
			
 
				-        #src_path = os.path.join(pdf_res_path, "pdf", f"{demo_name}.pdf") 
			
 
				-        #shutil.copy(src_path, res_path)
			
 
				+        src_path = os.path.join(pdf_res_path, demo_name, "auto", f"{demo_name}.md")
			
 
				+        shutil.copy(src_path, res_path)