|
|
@@ -251,14 +251,14 @@ python merger/merge_mineru_paddle_ocr.py \
|
|
|
--mineru-file /path/to/mineru_page_001.json \
|
|
|
--paddle-file /path/to/paddle_page_001.json \
|
|
|
--output-dir /path/to/output \
|
|
|
- --format both
|
|
|
+ --output-type both
|
|
|
|
|
|
# 批量处理
|
|
|
python merger/merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir /path/to/mineru_results \
|
|
|
--paddle-dir /path/to/paddle_results \
|
|
|
--output-dir /path/to/output \
|
|
|
- --format both \
|
|
|
+ --output-type both \
|
|
|
--window 15 \
|
|
|
--threshold 85
|
|
|
```
|
|
|
@@ -272,7 +272,7 @@ python merger/merge_mineru_paddle_ocr.py \
|
|
|
| `--mineru-dir` | MinerU 结果目录(批量模式) | - |
|
|
|
| `--paddle-dir` | PaddleOCR 结果目录(批量模式) | - |
|
|
|
| `-o, --output-dir` | 输出目录(必需) | - |
|
|
|
-| `-f, --format` | 输出格式:json/markdown/both | both |
|
|
|
+| `-f, --output-type` | 输出格式:json/markdown/both | both |
|
|
|
| `-w, --window` | 向前查找窗口大小 | 15 |
|
|
|
| `-t, --threshold` | 文本相似度阈值(0-100) | 80 |
|
|
|
|
|
|
@@ -287,14 +287,14 @@ python merger/merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-file /path/to/paddleocr_vl_page_001.json \
|
|
|
--paddle-file /path/to/paddle_page_001.json \
|
|
|
--output-dir /path/to/output \
|
|
|
- --format both
|
|
|
+ --output-type both
|
|
|
|
|
|
# 批量处理
|
|
|
python merger/merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir /path/to/paddleocr_vl_results \
|
|
|
--paddle-dir /path/to/paddle_results \
|
|
|
--output-dir /path/to/output \
|
|
|
- --format both \
|
|
|
+ --output-type both \
|
|
|
--window 15 \
|
|
|
--threshold 85
|
|
|
```
|
|
|
@@ -308,7 +308,7 @@ python merger/merge_paddleocr_vl_paddleocr.py \
|
|
|
| `--paddleocr-vl-dir` | PaddleOCR_VL 结果目录(批量模式) | - |
|
|
|
| `--paddle-dir` | PaddleOCR 结果目录(批量模式) | - |
|
|
|
| `-o, --output-dir` | 输出目录(必需) | - |
|
|
|
-| `-f, --format` | 输出格式:json/markdown/both | both |
|
|
|
+| `-f, --output-type` | 输出格式:json/markdown/both | both |
|
|
|
| `-w, --window` | 向前查找窗口大小 | 15 |
|
|
|
| `-t, --threshold` | 文本相似度阈值(0-100) | 80 |
|
|
|
|
|
|
@@ -571,80 +571,80 @@ def test_table_processing():
|
|
|
---
|
|
|
|
|
|
### 运行试验数据
|
|
|
-1. mineru-vlm-2.5.3
|
|
|
+#### 1. mineru-vlm-2.5.3
|
|
|
```bash
|
|
|
echo "A用户_单元格扫描流水"
|
|
|
python merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "B用户_扫描流水"
|
|
|
python merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/merged_results" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "德_内蒙古银行照"
|
|
|
python merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "对公_招商银行图"
|
|
|
python merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/mineru-vlm-2.5.3_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/merged_results" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "至远彩色印刷工业有限公司"
|
|
|
python merge_mineru_paddle_ocr.py \
|
|
|
--mineru-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/mineru-vlm-2.5.3_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/merged_results" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
```
|
|
|
|
|
|
-2. PaddleOCR_VL_Results
|
|
|
+#### 2. PaddleOCR_VL_Results
|
|
|
```bash
|
|
|
echo "A用户_单元格扫描流水"
|
|
|
python merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/PaddleOCR_VL_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/PaddleOCR_VL_Results_cell_bbox" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "B用户_扫描流水"
|
|
|
python merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/PaddleOCR_VL_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/PaddleOCR_VL_Results_cell_bbox" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "德_内蒙古银行照"
|
|
|
python merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/PaddleOCR_VL_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/PaddleOCR_VL_Results_cell_bbox" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "对公_招商银行图"
|
|
|
python merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results_cell_bbox" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
echo "至远彩色印刷工业有限公司"
|
|
|
python merge_paddleocr_vl_paddleocr.py \
|
|
|
--paddleocr-vl-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/PaddleOCR_VL_Results" \
|
|
|
--paddle-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results" \
|
|
|
--output-dir "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/PaddleOCR_VL_Results_cell_bbox" \
|
|
|
- --format "both"
|
|
|
+ --output-type "both"
|
|
|
|
|
|
```
|