
Feat/impl cli (#264)

* feat: refactor cli command

* feat: add docs to describe the output files of cli

* feat: resolve review comments

* feat: update docs about middle.json

---------

Co-authored-by: shenguanlin <shenguanlin@pjlab.org.cn>
icecraft 1 year ago
parent commit
40e0827e60

+ 2 - 0
.gitignore

@@ -35,3 +35,5 @@ ocr_demo
 
 /app/common/__init__.py
 /magic_pdf/config/__init__.py
+source.dev.env
+

+ 18 - 1
README_zh-CN_v2.md

@@ -191,7 +191,24 @@ pip install magic-pdf[full]==0.6.2b1 detectron2 --extra-index-url https://wheels
 
 ### Command Line
 
-TODO
+```bash
+magic-pdf -p {some_pdf} -o {some_output_dir}
+```
+
+After the command finishes, the results are saved in the `{some_output_dir}` directory. The output files are listed below:
+
+```text
+├── some_pdf.md                 # markdown file
+├── images                      # directory of extracted images
+├── layout.pdf                  # layout visualization
+├── middle.json                 # minerU intermediate processing result
+├── model.json                  # model inference result
+├── origin.pdf                  # original pdf file
+└── spans.pdf                   # visualization of bbox positions at span granularity
+```
+
+For more details about the output files, see the [output file documentation](docs/output_file_zh_cn.md).
+
 
 ### API
 

+ 3 - 16
docs/FAQ_zh_cn.md

@@ -50,20 +50,7 @@ pip install paddlepaddle==3.0.0b1
 This is likely caused by an incomplete model file download; try re-downloading the model files and run again.  
 Reference: https://github.com/opendatalab/MinerU/issues/143
 
-### 7. After the program finishes, the tmp directory cannot be found
-
-The program output directory is configured in "magic-pdf.json" via:
-```json
-{
-  "temp-output-dir": "/tmp"
-}
-```
-If this setting is left at the default, on linux/macOS a "magic-pdf" folder is created under the absolute path "/tmp" as the output location.  
-On windows, the default output path depends on which drive the command line is on: if the command line is on drive C, the default output path is "C://tmp/magic-pdf".  
-Reference: https://github.com/opendatalab/MinerU/issues/149
-
-### 8. Where should the model files be downloaded / how should models-dir be configured
+### 7. Where should the model files be downloaded / how should models-dir be configured
 
 The model file path is configured in "magic-pdf.json" via
 ```json
@@ -75,14 +62,14 @@ pip install paddlepaddle==3.0.0b1
 This must be an absolute path, not a relative one; the absolute path can be obtained by running "pwd" inside the models directory.  
 Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
 
-### 9. What does --model "model_json_path" on the command line refer to?
+### 8. What does --model "model_json_path" on the command line refer to?
 
 model_json refers to a json file in a specific format generated by model analysis.  
 If generated with the https://github.com/opendatalab/PDF-Extract-Kit project, the file is usually found in the project's output directory.  
 If the built-in model analysis is invoked via the MinerU command line, the file is usually found under the output path "/tmp/magic-pdf/pdf-name".  
 Reference: https://github.com/opendatalab/MinerU/issues/128
 
-### 10. Error on Linux: Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
+### 9. Error on Linux: Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
 
 In this case, first check your installed packages with pip list, focusing on whether the following libraries are installed (versions need not match exactly; having them is enough):
 ```bash

BIN
docs/images/layout_example.png


BIN
docs/images/poly.png


BIN
docs/images/spans_example.png


+ 327 - 0
docs/output_file_zh_cn.md

@@ -0,0 +1,327 @@
+
+
+## Overview
+Besides the markdown-related files, the `magic-pdf` command also produces several files that are not related to markdown. These files are described one by one below.
+
+
+### layout.pdf
+The layout of each page consists of one or more boxes. The number at the top-left corner of each box indicates its position in the reading order. In addition, layout.pdf uses background color blocks to delimit the different content blocks.
+
+![layout page example](images/layout_example.png)
+
+
+### spans.pdf
+All spans on a page are drawn with frames whose color depends on the span type. This file can be used for quality inspection: issues such as missing text or unrecognized interline equations can be spotted quickly.
+
+![span page example](images/spans_example.png)
+
+
+### model.json
+
+#### Structure Definition
+```python
+from pydantic import BaseModel, Field
+from enum import IntEnum
+
+class CategoryType(IntEnum):
+     title = 0               # title
+     plain_text = 1          # text
+     abandon = 2             # headers, footers, page numbers, page annotations
+     figure = 3              # image
+     figure_caption = 4      # image caption
+     table = 5               # table
+     table_caption = 6       # table caption
+     table_footnote = 7      # table footnote
+     isolate_formula = 8     # interline formula
+     formula_caption = 9     # interline formula label
+
+     embedding = 13          # inline formula
+     isolated = 14           # interline formula
+     text = 15               # ocr recognition result
+   
+     
+class PageInfo(BaseModel):
+    page_no: int = Field(description="page number; the first page is 0", ge=0)
+    height: int = Field(description="page height", gt=0)
+    width: int = Field(description="page width", ge=0)
+
+class ObjectInferenceResult(BaseModel):
+    category_id: CategoryType = Field(description="category", ge=0)
+    poly: list[float] = Field(description="quadrilateral coordinates: the top-left, top-right, bottom-right and bottom-left corner points")
+    score: float = Field(description="confidence of the inference result")
+    latex: str | None = Field(description="latex parsing result", default=None)
+    html: str | None = Field(description="html parsing result", default=None)
+  
+class PageInferenceResults(BaseModel):
+     layout_dets: list[ObjectInferenceResult] = Field(description="page recognition results")
+     page_info: PageInfo = Field(description="page meta information")
+    
+    
+# The inference results of all pages, collected into a list in page order, form the minerU inference result
+inference_result: list[PageInferenceResults] = []
+
+```
+
+The poly coordinate format is [x0, y0, x1, y1, x2, y2, x3, y3], giving the coordinates of the top-left, top-right, bottom-right and bottom-left corners respectively.
+![poly coordinate diagram](images/poly.png)
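+
+Below is a minimal helper sketch (not part of the library; corner values are hypothetical) for converting a poly quadrilateral into an axis-aligned [x0, y0, x1, y1] bbox:
+
+```python
+def poly_to_bbox(poly: list[float]) -> list[float]:
+    # poly holds four corner points: top-left, top-right, bottom-right, bottom-left
+    xs = poly[0::2]  # the four x coordinates
+    ys = poly[1::2]  # the four y coordinates
+    return [min(xs), min(ys), max(xs), max(ys)]
+
+# hypothetical corner values
+assert poly_to_bbox([146, 1103, 543, 1103, 543, 1184, 146, 1184]) == [146, 1103, 543, 1184]
+```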
+
+
+#### Example Data
+
+```json
+[
+    {
+        "layout_dets": [
+            {
+                "category_id": 2,
+                "poly": [
+                    99.1906967163086,
+                    100.3119125366211,
+                    730.3707885742188,
+                    100.3119125366211,
+                    730.3707885742188,
+                    245.81326293945312,
+                    99.1906967163086,
+                    245.81326293945312
+                ],
+                "score": 0.9999997615814209
+            }
+        ],
+        "page_info": {
+            "page_no": 0,
+            "height": 2339,
+            "width": 1654
+        }
+    },
+    {
+        "layout_dets": [
+            {
+                "category_id": 5,
+                "poly": [
+                    99.13092803955078,
+                    2210.680419921875,
+                    497.3183898925781,
+                    2210.680419921875,
+                    497.3183898925781,
+                    2264.78076171875,
+                    99.13092803955078,
+                    2264.78076171875
+                ],
+                "score": 0.9999997019767761
+            }
+        ],
+        "page_info": {
+            "page_no": 1,
+            "height": 2339,
+            "width": 1654
+        }
+    }
+]
+```
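+
+As a quick sanity check, the example above can be loaded and validated against the pydantic models from the structure definition. This is a sketch, not part of magic-pdf; it assumes the classes defined above are in scope and a model.json sits in the working directory:
+
+```python
+import json
+
+# PageInferenceResults is the pydantic model from the structure definition above
+with open("model.json") as f:
+    pages = [PageInferenceResults(**page) for page in json.load(f)]
+
+for page in pages:
+    print(page.page_info.page_no, len(page.layout_dets))
+```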
+
+
+### middle.json
+
+| Field | Description |
+| :-----| :---- |
+|pdf_info | list; each element is a dict holding the parse result of one pdf page, see the table below |
+|_parse_type | ocr \| txt, identifies the mode used for this intermediate parse |
+
+<br>
+
+**pdf_info**
+Field structure:
+
+| Field | Description |
+| :-----| :---- |
+| preproc_blocks | intermediate result after pdf preprocessing, before paragraph segmentation |
+| layout_bboxes | layout segmentation results, containing the layout direction (vertical, horizontal) and bboxes, sorted in reading order |
+| page_idx | page index, starting from 0 |
+| page_size | width and height of the page |
+| _layout_tree | layout tree structure |
+| images | list; each element is a dict representing an img_block |
+| tables | list; each element is a dict representing a table_block |
+| interline_equations | list; each element is a dict representing an interline_equation_block |
+| discarded_blocks | list; block information returned by the model that should be dropped |
+| para_blocks | result of segmenting preproc_blocks into paragraphs |
+
+In the table above, `para_blocks` is an array of dicts; each dict is a block structure, and blocks support at most one level of nesting.
+
+<br>
+
+**block**
+
+The outer block is called a first-level block. The fields of a first-level block are:
+
+| Field | Description |
+| :-----| :---- |
+| type | block type (table\|image) |
+| bbox | rectangle coordinates of the block |
+| blocks | list; each element is a dict-format second-level block |
+
+<br>
+First-level blocks come in only two types, "table" and "image"; all other blocks are second-level blocks.
+
+The fields of a second-level block are:
+
+| Field | Description |
+| :-----| :---- |
+| bbox | rectangle coordinates of the block |
+| lines | list; each element is a dict representing a line, describing the composition of one line of content |
+
+The second-level block types are:
+
+| type | description |
+| :-----| :---- |
+| image_body | body of the image |
+| image_caption | caption text of the image |
+| table_body | body of the table |
+| table_caption | caption text of the table |
+| table_footnote | footnote of the table |
+| text | text block |
+| title | title block |
+| interline_equation | interline formula block |
+
+<br>
+
+**line**
+
+The fields of a line are:
+
+| Field | Description |
+| :-----| :---- |
+| bbox | rectangle coordinates of the line |
+| spans | list; each element is a dict representing a span, the smallest compositional unit |
+
+
+<br>
+
+**span**
+
+| Field | Description |
+| :-----| :---- |
+| bbox | rectangle coordinates of the span |
+| type | type of the span |
+| content \| img_path | text spans use content; image and table spans use img_path, holding the actual text or the screenshot path |
+
+The span types are:
+
+| type | description |
+| :-----| :---- |
+| image | image |
+| table | table |
+| text | text |
+| inline_equation | inline formula |
+| interline_equation | interline formula |
+
+
+**Summary**
+
+The span is the smallest storage unit of all elements.
+
+The elements stored in para_blocks are block-level structures.
+
+The block hierarchy is:
+
+first-level block (if any) -> second-level block -> line -> span (see the traversal sketch below)
+
+
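+A minimal traversal sketch (not part of magic-pdf) that walks first-level block -> second-level block -> line -> span and collects all text content from a loaded middle.json:
+
+```python
+import json
+
+with open("middle.json") as f:
+    middle = json.load(f)
+
+texts = []
+for page in middle["pdf_info"]:
+    for block in page["para_blocks"]:
+        # first-level blocks ("table"/"image") nest second-level blocks under "blocks";
+        # all other blocks are already second-level
+        sub_blocks = block.get("blocks", [block])
+        for sub in sub_blocks:
+            for line in sub.get("lines", []):
+                for span in line["spans"]:
+                    if "content" in span:  # image/table spans carry img_path instead
+                        texts.append(span["content"])
+print("\n".join(texts))
+```
+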
+#### Example Data
+
+```json
+{
+    "pdf_info": [
+        {
+            "preproc_blocks": [
+                {
+                    "type": "text",
+                    "bbox": [
+                        52,
+                        61.956024169921875,
+                        294,
+                        82.99800872802734
+                    ],
+                    "lines": [
+                        {
+                            "bbox": [
+                                52,
+                                61.956024169921875,
+                                294,
+                                72.0000228881836
+                            ],
+                            "spans": [
+                                {
+                                    "bbox": [
+                                        54.0,
+                                        61.956024169921875,
+                                        296.2261657714844,
+                                        72.0000228881836
+                                    ],
+                                    "content": "dependent on the service headway and the reliability of the departure ",
+                                    "type": "text",
+                                    "score": 1.0
+                                }
+                            ]
+                        }
+                    ]
+                }
+            ],
+            "layout_bboxes": [
+                {
+                    "layout_bbox": [
+                        52,
+                        61,
+                        294,
+                        731
+                    ],
+                    "layout_label": "V",
+                    "sub_layout": []
+                }
+            ],
+            "page_idx": 0,
+            "page_size": [
+                612.0,
+                792.0
+            ],
+            "_layout_tree": [],
+            "images": [],
+            "tables": [],
+            "interline_equations": [],
+            "discarded_blocks": [],
+            "para_blocks": [
+                {
+                    "type": "text",
+                    "bbox": [
+                        52,
+                        61.956024169921875,
+                        294,
+                        82.99800872802734
+                    ],
+                    "lines": [
+                        {
+                            "bbox": [
+                                52,
+                                61.956024169921875,
+                                294,
+                                72.0000228881836
+                            ],
+                            "spans": [
+                                {
+                                    "bbox": [
+                                        54.0,
+                                        61.956024169921875,
+                                        296.2261657714844,
+                                        72.0000228881836
+                                    ],
+                                    "content": "dependent on the service headway and the reliability of the departure ",
+                                    "type": "text",
+                                    "score": 1.0
+                                }
+                            ]
+                        }
+                    ]
+                }
+            ]
+        }
+    ],
+    "_parse_type": "txt",
+    "_version_name": "0.6.1"
+}
+```

+ 0 - 1
magic-pdf.template.json

@@ -3,7 +3,6 @@
         "bucket-name-1":["ak", "sk", "endpoint"],
         "bucket-name-2":["ak", "sk", "endpoint"]
     },
-    "temp-output-dir":"/tmp",
     "models-dir":"/tmp/models",
     "device-mode":"cpu"
 }

+ 0 - 10
magic_pdf/libs/config_reader.py

@@ -57,16 +57,6 @@ def get_bucket_name(path):
     return bucket
 
 
-def get_local_dir():
-    config = read_config()
-    local_dir = config.get("temp-output-dir")
-    if local_dir is None:
-        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
-        return "/tmp"
-    else:
-        return local_dir
-
-
 def get_local_models_dir():
     config = read_config()
     models_dir = config.get("models-dir")

+ 1 - 18
magic_pdf/rw/AbsReaderWriter.py

@@ -2,33 +2,16 @@ from abc import ABC, abstractmethod
 
 
 class AbsReaderWriter(ABC):
-    """
-    Abstract base class supporting both binary and text read/write
-    """
     MODE_TXT = "text"
     MODE_BIN = "binary"
-
-    def __init__(self, parent_path):
-        # initialization code can be added here if needed
-        self.parent_path = parent_path  # parent dir for local paths; for s3, writes go under this path
-
     @abstractmethod
     def read(self, path: str, mode=MODE_TXT):
-        """
-        For both local and s3 paths: if the path is absolute, parent_path is not joined; if relative, parent_path is joined
-        """
         raise NotImplementedError
 
     @abstractmethod
     def write(self, content: str, path: str, mode=MODE_TXT):
-        """
-        For both local and s3 paths: if the path is absolute, parent_path is not joined; if relative, parent_path is joined
-        """
         raise NotImplementedError
 
     @abstractmethod
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
-        """
-        For both local and s3 paths: if the path is absolute, parent_path is not joined; if relative, parent_path is joined
-        """
+    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
         raise NotImplementedError

+ 32 - 24
magic_pdf/rw/DiskReaderWriter.py

@@ -3,34 +3,29 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from loguru import logger
 
 
-MODE_TXT = "text"
-MODE_BIN = "binary"
-
-
 class DiskReaderWriter(AbsReaderWriter):
-
     def __init__(self, parent_path, encoding="utf-8"):
         self.path = parent_path
         self.encoding = encoding
 
-    def read(self, path, mode=MODE_TXT):
+    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
         if os.path.isabs(path):
             abspath = path
         else:
             abspath = os.path.join(self.path, path)
         if not os.path.exists(abspath):
-            logger.error(f"文件 {abspath} 不存在")
-            raise Exception(f"文件 {abspath} 不存在")
-        if mode == MODE_TXT:
+            logger.error(f"file {abspath} does not exist")
+            raise Exception(f"file {abspath} does not exist")
+        if mode == AbsReaderWriter.MODE_TXT:
             with open(abspath, "r", encoding=self.encoding) as f:
                 return f.read()
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             with open(abspath, "rb") as f:
                 return f.read()
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
 
-    def write(self, content, path, mode=MODE_TXT):
+    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
         if os.path.isabs(path):
             abspath = path
         else:
@@ -38,29 +33,42 @@ class DiskReaderWriter(AbsReaderWriter):
         directory_path = os.path.dirname(abspath)
         if not os.path.exists(directory_path):
             os.makedirs(directory_path)
-        if mode == MODE_TXT:
+        if mode == AbsReaderWriter.MODE_TXT:
             with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                 f.write(content)
 
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             with open(abspath, "wb") as f:
                 f.write(content)
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
 
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
-        return self.read(path)
+    def read_offset(self, path: str, offset=0, limit=None):
+        abspath = path
+        if not os.path.isabs(path):
+            abspath = os.path.join(self.path, path)
+        with open(abspath, "rb") as f:
+            f.seek(offset)
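+            # limit=None reads to EOF: file.read(None) returns all remaining bytes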
+            return f.read(limit)
 
 
-# usage example
 if __name__ == "__main__":
-    file_path = "io/test/example.txt"
-    drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
+    if 0:
+        file_path = "io/test/example.txt"
+        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")
+
+        # write content to the file
+        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
+
+        # read the content back from the file
+        content = drw.read(path=file_path)
+        if content:
+            logger.info(f"content read from {file_path}: {content}")
+    if 1:
+        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
+        content_bin = drw.read_offset("1.txt")
+        assert content_bin == b"ABCD!"
 
-    # write content to the file
-    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
+        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
+        assert content_bin == b"BC"
 
-    # read the content back from the file
-    content = drw.read(path=file_path)
-    if content:
-        logger.info(f"content read from {file_path}: {content}")

+ 83 - 48
magic_pdf/rw/S3ReaderWriter.py

@@ -2,16 +2,18 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
 import boto3
 from loguru import logger
-from boto3.s3.transfer import TransferConfig
 from botocore.config import Config
-import os
-
-MODE_TXT = "text"
-MODE_BIN = "binary"
 
 
 class S3ReaderWriter(AbsReaderWriter):
-    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
+    def __init__(
+        self,
+        ak: str,
+        sk: str,
+        endpoint_url: str,
+        addressing_style: str = "auto",
+        parent_path: str = "",
+    ):
         self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
         self.path = parent_path
 
@@ -21,12 +23,14 @@ class S3ReaderWriter(AbsReaderWriter):
             aws_access_key_id=ak,
             aws_secret_access_key=sk,
             endpoint_url=endpoint_url,
-            config=Config(s3={"addressing_style": addressing_style},
-                          retries={'max_attempts': 5, 'mode': 'standard'}),
+            config=Config(
+                s3={"addressing_style": addressing_style},
+                retries={"max_attempts": 5, "mode": "standard"},
+            ),
         )
         return s3_client
 
-    def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
+    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
         if s3_relative_path.startswith("s3://"):
             s3_path = s3_relative_path
         else:
@@ -34,22 +38,22 @@ class S3ReaderWriter(AbsReaderWriter):
         bucket_name, key = parse_bucket_key(s3_path)
         res = self.client.get_object(Bucket=bucket_name, Key=key)
         body = res["Body"].read()
-        if mode == MODE_TXT:
+        if mode == AbsReaderWriter.MODE_TXT:
             data = body.decode(encoding)  # Decode bytes to text
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             data = body
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
         return data
 
-    def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
+    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
         if s3_relative_path.startswith("s3://"):
             s3_path = s3_relative_path
         else:
             s3_path = join_path(self.path, s3_relative_path)
-        if mode == MODE_TXT:
+        if mode == AbsReaderWriter.MODE_TXT:
             body = content.encode(encoding)  # Encode text data as bytes
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             body = content
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
@@ -57,51 +61,82 @@ class S3ReaderWriter(AbsReaderWriter):
         self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
         logger.info(f"内容已写入 {s3_path} ")
 
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
+    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
         if path.startswith("s3://"):
             s3_path = path
         else:
             s3_path = join_path(self.path, path)
         bucket_name, key = parse_bucket_key(s3_path)
 
-        range_header = f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'
+        range_header = (
+            f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
+        )
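+        # HTTP Range is inclusive at both ends: offset=0, limit=10 -> "bytes=0-9" (10 bytes)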
         res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
-        body = res["Body"].read()
-        if mode == MODE_TXT:
-            data = body.decode(encoding)  # Decode bytes to text
-        elif mode == MODE_BIN:
-            data = body
-        else:
-            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
-        return data
+        return res["Body"].read()
 
 
 if __name__ == "__main__":
-    # Config the connection info
-    ak = ""
-    sk = ""
-    endpoint_url = ""
-    addressing_style = "auto"
-    bucket_name = ""
-    # Create an S3ReaderWriter object
-    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")
+    if 0:
+        # Config the connection info
+        ak = ""
+        sk = ""
+        endpoint_url = ""
+        addressing_style = "auto"
+        bucket_name = ""
+        # Create an S3ReaderWriter object
+        s3_reader_writer = S3ReaderWriter(
+            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
+        )
 
-    # Write text data to S3
-    text_data = "This is some text data"
-    s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
+        # Write text data to S3
+        text_data = "This is some text data"
+        s3_reader_writer.write(
+            text_data,
+            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+        # Read text data from S3
+        text_data_read = s3_reader_writer.read(
+            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
+        )
+        logger.info(f"Read text data from S3: {text_data_read}")
+        # Write binary data to S3
+        binary_data = b"This is some binary data"
+        s3_reader_writer.write(
+            binary_data,
+            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
+            mode=AbsReaderWriter.MODE_BIN,
+        )
 
-    # Read text data from S3
-    text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
-    logger.info(f"Read text data from S3: {text_data_read}")
-    # Write binary data to S3
-    binary_data = b"This is some binary data"
-    s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
+        # Read binary data from S3
+        binary_data_read = s3_reader_writer.read(
+            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
+        )
+        logger.info(f"Read binary data from S3: {binary_data_read}")
+
+        # Range Read text data from S3
+        binary_data_read = s3_reader_writer.read_offset(
+            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
+        )
+        logger.info(f"Read binary data from S3: {binary_data_read}")
+    if 1:
+        import os
+        import json
 
-    # Read binary data from S3
-    binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
-    logger.info(f"Read binary data from S3: {binary_data_read}")
+        ak = os.getenv("AK", "")
+        sk = os.getenv("SK", "")
+        endpoint_url = os.getenv("ENDPOINT", "")
+        bucket = os.getenv("S3_BUCKET", "")
+        prefix = os.getenv("S3_PREFIX", "")
+        key_basename = os.getenv("S3_KEY_BASENAME", "")
+        s3_reader_writer = S3ReaderWriter(
+            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
+        )
+        content_bin = s3_reader_writer.read_offset(key_basename)
+        assert content_bin[:10] == b'{"track_id'
+        assert content_bin[-10:] == b'r":null}}\n'
 
-    # Range Read text data from S3
-    binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
-                                                   byte_start=0, byte_end=10, mode=MODE_BIN)
-    logger.info(f"Read binary data from S3: {binary_data_read}")
+        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
+        jso = json.dumps(content_bin.decode("utf-8"))
+        print(jso)

+ 0 - 0
magic_pdf/cli/__init__.py → magic_pdf/tools/__init__.py


+ 75 - 0
magic_pdf/tools/cli.py

@@ -0,0 +1,75 @@
+import os
+import click
+from loguru import logger
+from pathlib import Path
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+import magic_pdf.model as model_config
+from magic_pdf.tools.common import parse_pdf_methods, do_parse
+
+
+@click.command()
+@click.option(
+    "-p",
+    "--path",
+    "path",
+    type=click.Path(exists=True),
+    required=True,
+    help="local pdf filepath or directory",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    "output_dir",
+    type=str,
+    help="output local directory",
+    default="",
+)
+@click.option(
+    "-m",
+    "--method",
+    "method",
+    type=parse_pdf_methods,
+    help="""the method for parsing pdf. 
+ocr: using ocr technique to extract information from pdf.
+txt: suitable for the text-based pdf only and outperform ocr. 
+auto: automatically choose the best method for parsing pdf from ocr and txt""",
+    default="auto",
+)
+def cli(path, output_dir, method):
+    model_config.__use_inside_model__ = True
+    model_config.__model_mode__ = "full"
+    if output_dir == "":
+        if os.path.isdir(path):
+            output_dir = os.path.join(path, "output")
+        else:
+            output_dir = os.path.join(os.path.dirname(path), "output")
+
+    def read_fn(path):
+        disk_rw = DiskReaderWriter(os.path.dirname(path))
+        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
+
+    def parse_doc(doc_path: str):
+        try:
+            file_name = str(Path(doc_path).stem)
+            pdf_data = read_fn(doc_path)
+            do_parse(
+                output_dir,
+                file_name,
+                pdf_data,
+                [],
+                method,
+            )
+
+        except Exception as e:
+            logger.exception(e)
+
+    if os.path.isdir(path):
+        for doc_path in Path(path).glob("*.pdf"):
+            parse_doc(doc_path)
+    else:
+        parse_doc(path)
+
+
+if __name__ == "__main__":
+    cli()
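
A minimal in-process invocation sketch (paths are hypothetical), mirroring how the tests below drive the command:

```python
from click.testing import CliRunner
from magic_pdf.tools.cli import cli

runner = CliRunner()
# equivalent to: magic-pdf -p demo.pdf -o /tmp/out -m auto
result = runner.invoke(cli, ["-p", "demo.pdf", "-o", "/tmp/out", "-m", "auto"])
assert result.exit_code == 0
```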

+ 154 - 0
magic_pdf/tools/cli_dev.py

@@ -0,0 +1,154 @@
+import os
+import json as json_parse
+import click
+from pathlib import Path
+from magic_pdf.libs.path_utils import (
+    parse_s3path,
+    parse_s3_range_params,
+    remove_non_official_s3_args,
+)
+from magic_pdf.libs.config_reader import (
+    get_s3_config,
+)
+from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+import magic_pdf.model as model_config
+from magic_pdf.tools.common import parse_pdf_methods, do_parse
+
+
+def read_s3_path(s3path):
+    bucket, key = parse_s3path(s3path)
+
+    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
+    s3_rw = S3ReaderWriter(
+        s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
+    )
+    may_range_params = parse_s3_range_params(s3path)
+    # parse_s3_range_params appears to yield (start, length); read_offset takes an
+    # offset and a byte count (read_jsonl was renamed to read_offset in this change)
+    if may_range_params is None or 2 != len(may_range_params):
+        byte_start, limit = 0, None
+    else:
+        byte_start, limit = int(may_range_params[0]), int(may_range_params[1])
+    return s3_rw.read_offset(
+        remove_non_official_s3_args(s3path),
+        byte_start,
+        limit,
+    )
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command()
+@click.option(
+    "-j",
+    "--jsonl",
+    "jsonl",
+    type=str,
+    help="输入 jsonl 路径,本地或者 s3 上的文件",
+    required=True,
+)
+@click.option(
+    "-m",
+    "--method",
+    "method",
+    type=parse_pdf_methods,
+    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
+    default="auto",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    "output_dir",
+    type=str,
+    help="输出到本地目录",
+    default="",
+)
+def jsonl(jsonl, method, output_dir):
+    print("haha")
+    model_config.__use_inside_model__ = False
+    full_jsonl_path = os.path.realpath(jsonl)
+    if output_dir == "":
+        output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
+
+    if jsonl.startswith("s3://"):
+        jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
+    else:
+        with open(jsonl) as f:
+            jso = json_parse.loads(f.readline())
+    s3_file_path = jso.get("file_location")
+    if s3_file_path is None:
+        s3_file_path = jso.get("path")
+    pdf_file_name = Path(s3_file_path).stem
+    pdf_data = read_s3_path(s3_file_path)
+
+    do_parse(
+        output_dir,
+        pdf_file_name,
+        pdf_data,
+        jso["doc_layout_result"],
+        method,
+        f_dump_content_list=True,
+    )
+
+
+@cli.command()
+@click.option(
+    "-p",
+    "--pdf",
+    "pdf",
+    type=click.Path(exists=True),
+    required=True,
+    help="本地 PDF 文件",
+)
+@click.option(
+    "-j",
+    "--json",
+    "json_data",
+    type=click.Path(exists=True),
+    required=True,
+    help="本地模型推理出的 json 数据",
+)
+@click.option(
+    "-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
+)
+@click.option(
+    "-m",
+    "--method",
+    "method",
+    type=parse_pdf_methods,
+    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
+    default="auto",
+)
+def pdf(pdf, json_data, output_dir, method):
+    model_config.__use_inside_model__ = False
+    full_pdf_path = os.path.realpath(pdf)
+    if output_dir == "":
+        output_dir = os.path.join(os.path.dirname(full_pdf_path), "output")
+
+    def read_fn(path):
+        disk_rw = DiskReaderWriter(os.path.dirname(path))
+        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
+
+    model_json_list = json_parse.loads(read_fn(json_data).decode("utf-8"))
+
+    file_name = str(Path(full_pdf_path).stem)
+    pdf_data = read_fn(full_pdf_path)
+    do_parse(
+        output_dir,
+        file_name,
+        pdf_data,
+        model_json_list,
+        method,
+        f_dump_content_list=True,
+    )
+
+
+if __name__ == "__main__":
+    cli()

+ 117 - 0
magic_pdf/tools/common.py

@@ -0,0 +1,117 @@
+import os
+import json as json_parse
+import copy
+import click
+from loguru import logger
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
+from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
+from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.pipe.TXTPipe import TXTPipe
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+import magic_pdf.model as model_config
+
+
+def prepare_env(output_dir, pdf_file_name, method):
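+    # lays out <output_dir>/<pdf_file_name>/<method> plus an images/ subdir,
+    # e.g. ("/tmp/out", "demo", "auto") -> "/tmp/out/demo/auto"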
+    local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
+
+    local_image_dir = os.path.join(str(local_parent_dir), "images")
+    local_md_dir = local_parent_dir
+    os.makedirs(local_image_dir, exist_ok=True)
+    os.makedirs(local_md_dir, exist_ok=True)
+    return local_image_dir, local_md_dir
+
+
+def do_parse(
+    output_dir,
+    pdf_file_name,
+    pdf_bytes,
+    model_list,
+    parse_method,
+    f_draw_span_bbox=True,
+    f_draw_layout_bbox=True,
+    f_dump_md=True,
+    f_dump_middle_json=True,
+    f_dump_model_json=True,
+    f_dump_orig_pdf=True,
+    f_dump_content_list=False,
+    f_make_md_mode=MakeMode.MM_MD,
+):
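+    # snapshot the input model list so the model.json dump reflects the original inference output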
+    orig_model_list = copy.deepcopy(model_list)
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
+    logger.info(f"local output dir is {local_md_dir}")
+    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
+        local_md_dir
+    )
+    image_dir = str(os.path.basename(local_image_dir))
+
+    if parse_method == "auto":
+        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
+        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
+    elif parse_method == "txt":
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
+    elif parse_method == "ocr":
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
+    else:
+        logger.error("unknown parse method")
+        exit(1)
+
+    pipe.pipe_classify()
+
+    if len(model_list) == 0:
+        if model_config.__use_inside_model__:
+            pipe.pipe_analyze()
+            orig_model_list = copy.deepcopy(pipe.model_list)
+        else:
+            logger.error("need model list input")
+            exit(2)
+
+    pipe.pipe_parse()
+    pdf_info = pipe.pdf_mid_data["pdf_info"]
+    if f_draw_layout_bbox:
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
+    if f_draw_span_bbox:
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+
+    md_content = pipe.pipe_mk_markdown(
+        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
+    )
+    if f_dump_md:
+        md_writer.write(
+            content=md_content,
+            path=f"{pdf_file_name}.md",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_middle_json:
+        md_writer.write(
+            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
+            path="middle.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_model_json:
+        md_writer.write(
+            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
+            path="model.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+    if f_dump_orig_pdf:
+        md_writer.write(
+            content=pdf_bytes,
+            path="origin.pdf",
+            mode=AbsReaderWriter.MODE_BIN,
+        )
+
+    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
+    if f_dump_content_list:
+        md_writer.write(
+            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
+            path="content_list.json",
+            mode=AbsReaderWriter.MODE_TXT,
+        )
+
+
+parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
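
A minimal sketch of calling do_parse directly (hypothetical paths; with model_list=[], the built-in model analysis runs only when model_config.__use_inside_model__ is True, as set in cli.py):

```python
from magic_pdf.tools.common import do_parse
import magic_pdf.model as model_config

model_config.__use_inside_model__ = True  # let the pipeline run its own model analysis

with open("demo.pdf", "rb") as f:
    pdf_bytes = f.read()

# outputs land under /tmp/out/demo/auto: demo.md, middle.json, model.json,
# origin.pdf, layout.pdf and spans.pdf
do_parse("/tmp/out", "demo", pdf_bytes, [], "auto")
```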

+ 2 - 1
setup.py

@@ -52,7 +52,8 @@ if __name__ == '__main__':
         python_requires=">=3.9",  # 项目依赖的 Python 版本
         entry_points={
             "console_scripts": [
-                "magic-pdf = magic_pdf.cli.magicpdf:cli"
+                "magic-pdf = magic_pdf.tools.cli:cli",
+                "magic-pdf-dev = magic_pdf.tools.cli_dev:cli" 
             ],
         },  # 项目提供的可执行命令
         include_package_data=True,  # 是否包含非代码文件,如数据文件、配置文件等

+ 0 - 0
tests/test_tools/__init__.py


BIN
tests/test_tools/assets/cli/path/cli_test_01.pdf


BIN
tests/test_tools/assets/cli/path/cli_test_02.pdf


BIN
tests/test_tools/assets/cli/pdf/cli_test_01.pdf


Diff file is too large
+ 0 - 0
tests/test_tools/assets/cli_dev/cli_test_01.jsonl


+ 638 - 0
tests/test_tools/assets/cli_dev/cli_test_01.model.json

@@ -0,0 +1,638 @@
+[
+    {
+        "layout_dets": [
+            {
+                "category_id": 1,
+                "poly": [
+                    882.4013061523438,
+                    169.93817138671875,
+                    1552.350341796875,
+                    169.93817138671875,
+                    1552.350341796875,
+                    625.8263549804688,
+                    882.4013061523438,
+                    625.8263549804688
+                ],
+                "score": 0.999992311000824
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    882.474853515625,
+                    1450.92822265625,
+                    1551.4490966796875,
+                    1450.92822265625,
+                    1551.4490966796875,
+                    1877.5712890625,
+                    882.474853515625,
+                    1877.5712890625
+                ],
+                "score": 0.9999903440475464
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    881.6513061523438,
+                    626.2058715820312,
+                    1552.1400146484375,
+                    626.2058715820312,
+                    1552.1400146484375,
+                    1450.604736328125,
+                    881.6513061523438,
+                    1450.604736328125
+                ],
+                "score": 0.9999856352806091
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    149.41075134277344,
+                    232.1595001220703,
+                    819.0465087890625,
+                    232.1595001220703,
+                    819.0465087890625,
+                    625.8865356445312,
+                    149.41075134277344,
+                    625.8865356445312
+                ],
+                "score": 0.99998539686203
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    149.3945770263672,
+                    1215.5172119140625,
+                    817.8850708007812,
+                    1215.5172119140625,
+                    817.8850708007812,
+                    1304.873291015625,
+                    149.3945770263672,
+                    1304.873291015625
+                ],
+                "score": 0.9999765157699585
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    882.6979370117188,
+                    1880.13916015625,
+                    1552.15185546875,
+                    1880.13916015625,
+                    1552.15185546875,
+                    2031.339599609375,
+                    882.6979370117188,
+                    2031.339599609375
+                ],
+                "score": 0.9999744892120361
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    148.96054077148438,
+                    743.3055419921875,
+                    818.6231689453125,
+                    743.3055419921875,
+                    818.6231689453125,
+                    1074.2369384765625,
+                    148.96054077148438,
+                    1074.2369384765625
+                ],
+                "score": 0.9999669790267944
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    148.8435516357422,
+                    1791.14306640625,
+                    818.6885375976562,
+                    1791.14306640625,
+                    818.6885375976562,
+                    2030.794189453125,
+                    148.8435516357422,
+                    2030.794189453125
+                ],
+                "score": 0.9999618530273438
+            },
+            {
+                "category_id": 0,
+                "poly": [
+                    150.7009735107422,
+                    684.0087890625,
+                    623.5106201171875,
+                    684.0087890625,
+                    623.5106201171875,
+                    717.03662109375,
+                    150.7009735107422,
+                    717.03662109375
+                ],
+                "score": 0.9999415278434753
+            },
+            {
+                "category_id": 8,
+                "poly": [
+                    146.48068237304688,
+                    1331.6737060546875,
+                    317.2640075683594,
+                    1331.6737060546875,
+                    317.2640075683594,
+                    1400.1722412109375,
+                    146.48068237304688,
+                    1400.1722412109375
+                ],
+                "score": 0.9998958110809326
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    149.42420959472656,
+                    1430.8782958984375,
+                    818.9042358398438,
+                    1430.8782958984375,
+                    818.9042358398438,
+                    1672.7386474609375,
+                    149.42420959472656,
+                    1672.7386474609375
+                ],
+                "score": 0.9998599290847778
+            },
+            {
+                "category_id": 1,
+                "poly": [
+                    149.18746948242188,
+                    172.10252380371094,
+                    818.5662231445312,
+                    172.10252380371094,
+                    818.5662231445312,
+                    230.4594268798828,
+                    149.18746948242188,
+                    230.4594268798828
+                ],
+                "score": 0.9997718334197998
+            },
+            {
+                "category_id": 0,
+                "poly": [
+                    149.0175018310547,
+                    1732.1090087890625,
+                    702.1005859375,
+                    1732.1090087890625,
+                    702.1005859375,
+                    1763.6046142578125,
+                    149.0175018310547,
+                    1763.6046142578125
+                ],
+                "score": 0.9997085928916931
+            },
+            {
+                "category_id": 2,
+                "poly": [
+                    1519.802490234375,
+                    98.59099578857422,
+                    1551.985107421875,
+                    98.59099578857422,
+                    1551.985107421875,
+                    119.48420715332031,
+                    1519.802490234375,
+                    119.48420715332031
+                ],
+                "score": 0.9995552897453308
+            },
+            {
+                "category_id": 8,
+                "poly": [
+                    146.9109649658203,
+                    1100.156494140625,
+                    544.2803344726562,
+                    1100.156494140625,
+                    544.2803344726562,
+                    1184.929443359375,
+                    146.9109649658203,
+                    1184.929443359375
+                ],
+                "score": 0.9995207786560059
+            },
+            {
+                "category_id": 2,
+                "poly": [
+                    148.11611938476562,
+                    99.87767791748047,
+                    318.926025390625,
+                    99.87767791748047,
+                    318.926025390625,
+                    120.70393371582031,
+                    148.11611938476562,
+                    120.70393371582031
+                ],
+                "score": 0.999351441860199
+            },
+            {
+                "category_id": 9,
+                "poly": [
+                    791.7642211914062,
+                    1130.056396484375,
+                    818.6940307617188,
+                    1130.056396484375,
+                    818.6940307617188,
+                    1161.1080322265625,
+                    791.7642211914062,
+                    1161.1080322265625
+                ],
+                "score": 0.9908884763717651
+            },
+            {
+                "category_id": 9,
+                "poly": [
+                    788.37060546875,
+                    1346.8450927734375,
+                    818.5010986328125,
+                    1346.8450927734375,
+                    818.5010986328125,
+                    1377.370361328125,
+                    788.37060546875,
+                    1377.370361328125
+                ],
+                "score": 0.9873985052108765
+            },
+            {
+                "category_id": 14,
+                "poly": [
+                    146,
+                    1103,
+                    543,
+                    1103,
+                    543,
+                    1184,
+                    146,
+                    1184
+                ],
+                "score": 0.94,
+                "latex": "E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1196,
+                    354,
+                    1278,
+                    354,
+                    1278,
+                    384,
+                    1196,
+                    384
+                ],
+                "score": 0.91,
+                "latex": "p(1-q)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    881,
+                    415,
+                    1020,
+                    415,
+                    1020,
+                    444,
+                    881,
+                    444
+                ],
+                "score": 0.91,
+                "latex": "(1-p)(1-q)"
+            },
+            {
+                "category_id": 14,
+                "poly": [
+                    147,
+                    1333,
+                    318,
+                    1333,
+                    318,
+                    1400,
+                    147,
+                    1400
+                ],
+                "score": 0.91,
+                "latex": "\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1197,
+                    657,
+                    1263,
+                    657,
+                    1263,
+                    686,
+                    1197,
+                    686
+                ],
+                "score": 0.9,
+                "latex": "(1-p)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    213,
+                    1217,
+                    263,
+                    1217,
+                    263,
+                    1244,
+                    213,
+                    1244
+                ],
+                "score": 0.88,
+                "latex": "E[X]"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    214,
+                    1434,
+                    245,
+                    1434,
+                    245,
+                    1459,
+                    214,
+                    1459
+                ],
+                "score": 0.87,
+                "latex": "\\upsigma_{H}"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    324,
+                    2002,
+                    373,
+                    2002,
+                    373,
+                    2028,
+                    324,
+                    2028
+                ],
+                "score": 0.84,
+                "latex": "30\\%"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1209,
+                    693,
+                    1225,
+                    693,
+                    1225,
+                    717,
+                    1209,
+                    717
+                ],
+                "score": 0.83,
+                "latex": "p"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    990,
+                    449,
+                    1007,
+                    449,
+                    1007,
+                    474,
+                    990,
+                    474
+                ],
+                "score": 0.81,
+                "latex": "p"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    346,
+                    1277,
+                    369,
+                    1277,
+                    369,
+                    1301,
+                    346,
+                    1301
+                ],
+                "score": 0.81,
+                "latex": "H"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1137,
+                    661,
+                    1154,
+                    661,
+                    1154,
+                    686,
+                    1137,
+                    686
+                ],
+                "score": 0.81,
+                "latex": "p"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    522,
+                    1432,
+                    579,
+                    1432,
+                    579,
+                    1459,
+                    522,
+                    1459
+                ],
+                "score": 0.81,
+                "latex": "H\\left(4\\right)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    944,
+                    540,
+                    962,
+                    540,
+                    962,
+                    565,
+                    944,
+                    565
+                ],
+                "score": 0.8,
+                "latex": "p"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1444,
+                    936,
+                    1461,
+                    936,
+                    1461,
+                    961,
+                    1444,
+                    961
+                ],
+                "score": 0.79,
+                "latex": "p"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    602,
+                    1247,
+                    624,
+                    1247,
+                    624,
+                    1270,
+                    602,
+                    1270
+                ],
+                "score": 0.78,
+                "latex": "H"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    147,
+                    1247,
+                    167,
+                    1247,
+                    167,
+                    1271,
+                    147,
+                    1271
+                ],
+                "score": 0.77,
+                "latex": "X"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    210,
+                    1246,
+                    282,
+                    1246,
+                    282,
+                    1274,
+                    210,
+                    1274
+                ],
+                "score": 0.77,
+                "latex": "\\operatorname{CV}(H)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1346,
+                    268,
+                    1361,
+                    268,
+                    1361,
+                    292,
+                    1346,
+                    292
+                ],
+                "score": 0.76,
+                "latex": "q"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    215,
+                    957,
+                    238,
+                    957,
+                    238,
+                    981,
+                    215,
+                    981
+                ],
+                "score": 0.74,
+                "latex": "H"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    149,
+                    956,
+                    173,
+                    956,
+                    173,
+                    981,
+                    149,
+                    981
+                ],
+                "score": 0.63,
+                "latex": "W"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    924,
+                    841,
+                    1016,
+                    841,
+                    1016,
+                    868,
+                    924,
+                    868
+                ],
+                "score": 0.56,
+                "latex": "8{\\cdot}00\\;\\mathrm{a.m}"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    956,
+                    871,
+                    1032,
+                    871,
+                    1032,
+                    898,
+                    956,
+                    898
+                ],
+                "score": 0.43,
+                "latex": "20~\\mathrm{min}"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    1082,
+                    781,
+                    1112,
+                    781,
+                    1112,
+                    808,
+                    1082,
+                    808
+                ],
+                "score": 0.41,
+                "latex": "(l)"
+            },
+            {
+                "category_id": 13,
+                "poly": [
+                    697,
+                    1821,
+                    734,
+                    1821,
+                    734,
+                    1847,
+                    697,
+                    1847
+                ],
+                "score": 0.3,
+                "latex": "^{1\\mathrm{~h~}}"
+            }
+        ],
+        "page_info": {
+            "page_no": 0,
+            "height": 2200,
+            "width": 1700
+        }
+    }
+]

BIN
tests/test_tools/assets/cli_dev/cli_test_01.pdf


BIN
tests/test_tools/assets/common/cli_test_01.pdf


+ 125 - 0
tests/test_tools/test_cli.py

@@ -0,0 +1,125 @@
+import tempfile
+import os
+import shutil
+from click.testing import CliRunner
+
+from magic_pdf.tools.cli import cli
+
+
+def test_cli_pdf():
+    # setup
+    unittest_dir = "/tmp/magic_pdf/unittest/tools"
+    filename = "cli_test_01"
+    os.makedirs(unittest_dir, exist_ok=True)
+    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)
+
+    # run
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        [
+            "-p",
+            "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
+            "-o",
+            temp_output_dir,
+        ],
+    )
+
+    # check
+    assert result.exit_code == 0
+
+    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 7000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
+    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+
+    # teardown
+    shutil.rmtree(temp_output_dir)
+
+
+def test_cli_path():
+    # setup
+    unittest_dir = "/tmp/magic_pdf/unittest/tools"
+    os.makedirs(unittest_dir, exist_ok=True)
+    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)
+
+    # run
+    runner = CliRunner()
+    result = runner.invoke(
+        cli, ["-p", "tests/test_tools/assets/cli/path", "-o", temp_output_dir]
+    )
+
+    # check
+    assert result.exit_code == 0
+
+    filename = "cli_test_01"
+    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 7000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
+    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+
+    base_output_dir = os.path.join(temp_output_dir, "cli_test_02/auto")
+    filename = "cli_test_02"
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 5000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
+    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+
+    # teardown
+    shutil.rmtree(temp_output_dir)

+ 120 - 0
tests/test_tools/test_cli_dev.py

@@ -0,0 +1,120 @@
+import tempfile
+import os
+import shutil
+from click.testing import CliRunner
+
+from magic_pdf.tools import cli_dev
+
+
+def test_cli_pdf():
+    # setup
+    unittest_dir = "/tmp/magic_pdf/unittest/tools"
+    filename = "cli_test_01"
+    os.makedirs(unittest_dir, exist_ok=True)
+    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)
+
+    # run
+    runner = CliRunner()
+    result = runner.invoke(
+        cli_dev.cli,
+        [
+            "pdf",
+            "-p",
+            "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
+            "-j",
+            "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
+            "-o",
+            temp_output_dir,
+        ],
+    )
+
+    # check
+    assert result.exit_code == 0
+
+    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+
+    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    assert r.st_size > 5000
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 7000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
+    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+
+    # teardown
+    shutil.rmtree(temp_output_dir)
+
+
+def test_cli_jsonl():
+    # setup
+    unittest_dir = "/tmp/magic_pdf/unittest/tools"
+    filename = "cli_test_01"
+    os.makedirs(unittest_dir, exist_ok=True)
+    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)
+
+    def mock_read_s3_path(s3path):
+        with open(s3path, "rb") as f:
+            return f.read()
+
+    cli_dev.read_s3_path = mock_read_s3_path # mock
+
+    # run
+    runner = CliRunner()
+    result = runner.invoke(
+        cli_dev.cli,
+        [
+            "jsonl",
+            "-j",
+            "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
+            "-o",
+            temp_output_dir,
+        ],
+    )
+
+    # check
+    assert result.exit_code == 0
+
+    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+
+    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    assert r.st_size > 5000
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 7000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
+    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+
+    # teardown
+    shutil.rmtree(temp_output_dir)

+ 52 - 0
tests/test_tools/test_common.py

@@ -0,0 +1,52 @@
+import tempfile
+import os
+import shutil
+
+import pytest
+
+from magic_pdf.tools.common import do_parse
+
+
+@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
+def test_common_do_parse(method):
+    # setup
+    unittest_dir = "/tmp/magic_pdf/unittest/tools"
+    filename = "fake"
+    os.makedirs(unittest_dir, exist_ok=True)
+
+    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)
+
+    # run
+    with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
+        bits = f.read()
+    do_parse(temp_output_dir, filename, bits, [], method, f_dump_content_list=True)
+
+    # check
+    base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")
+
+    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    assert r.st_size > 5000
+
+    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    assert r.st_size > 7000
+
+    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    assert r.st_size > 200000
+
+    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    assert r.st_size > 15000
+
+    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
+    assert r.st_size > 500000
+
+    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
+    assert r.st_size > 500000
+
+    assert os.path.exists(os.path.join(base_output_dir, "images"))
+    assert os.path.isdir(os.path.join(base_output_dir, "images"))
+
+    # teardown
+    shutil.rmtree(temp_output_dir)

Some files were not shown because too many files changed in this diff