
feat: Add report generation and similarity calculation modules

- Implemented ReportGenerator class for generating JSON and Markdown reports from comparison results.
- Added SimilarityCalculator class to compute text similarity and check punctuation differences.
- Created TableComparator class for comparing table data, including header detection and content feature similarity.
- Introduced TextProcessor class for text normalization and preprocessing, including Markdown formatting removal and punctuation normalization.
zhch158_admin, 1 week ago
parent commit 244ee9de2c

+ 943 - 0
ocr_comparator/README.md

@@ -0,0 +1,943 @@
+# 📊 OCR 结果对比模块 (Comparator)
+
+OCR 结果对比模块提供了强大的文档对比功能,支持表格、段落的细粒度差异检测,特别优化了财务报表、流水表格等复杂文档的对比。
+
+## 📁 模块结构
+
+```
+comparator/
+├── __init__.py                      # 模块初始化
+├── compare_ocr_results.py           # 命令行对比工具(入口)
+├── ocr_comparator.py                # 核心对比器
+├── table_comparator.py              # 表格对比器 ✨
+├── paragraph_comparator.py          # 段落对比器
+├── similarity_calculator.py         # 相似度计算器
+├── data_type_detector.py            # 数据类型检测器
+├── content_extractor.py             # 内容提取器
+├── text_processor.py                # 文本处理器
+├── report_generator.py              # 报告生成器
+└── README.md                        # 本文档
+```
+
+## ✨ 核心功能
+
+### 🎯 智能表格对比
+
+#### 1. 两种对比模式
+
+**标准模式 (`standard`)**
+- 适用于结构固定的表格
+- 逐行逐列精确对比
+- 不进行表头检测
+- 适合:静态报表、统计表
+
+**流水模式 (`flow_list`)** ✨
+- 适用于结构可变的表格
+- **智能表头检测**(关键词匹配)
+- **支持多层表头识别**(如资产负债表)
+- 列类型自动检测
+- 差异严重度智能分级
+- 适合:流水表、财务报表、交易记录
+
+
+#### 2. 列类型自动检测
+
+**支持的数据类型:**
+
+| 类型 | 标识 | 特征 | 示例 |
+|------|------|------|------|
+| 数字金额 | `numeric` | 包含数字、小数点、逗号 | `28,239,305.48` |
+| 日期时间 | `datetime` | 符合日期格式 | `2023-12-31` / `2023年12月31日` |
+| 文本型数字 | `text_number` | 纯数字但作为文本(如票据号) | `20231231001` |
+| 普通文本 | `text` | 其他文本内容 | `货币资金` |
+
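上表中的类型判定可以用如下谓词函数示意(这是假设的简化正则实现,仅作参考,实际规则以 `data_type_detector.py` 为准):

```python
import re

def is_numeric(value: str) -> bool:
    """金额:带千分位逗号或小数点的数字"""
    v = str(value).strip()
    return bool(re.fullmatch(r'-?\d{1,3}(,\d{3})+(\.\d+)?', v)
                or re.fullmatch(r'-?\d+\.\d+', v))

def is_datetime(value: str) -> bool:
    """日期:2023-12-31 / 2023/12/31 / 2023年12月31日 等"""
    return bool(re.fullmatch(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}日?', str(value).strip()))

def is_text_number(value: str) -> bool:
    """文本型数字:纯数字串(如票据号),不含逗号和小数点"""
    return bool(re.fullmatch(r'\d+', str(value).strip()))
```

注意判定顺序:先金额、再日期、最后文本型数字,纯数字串才不会被误判为金额。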
+**检测算法:**
+
+```python
+def detect_column_type(column_values):
+    """检测列的数据类型"""
+    numeric_count = 0
+    datetime_count = 0
+    text_number_count = 0
+    
+    for value in column_values:
+        if is_numeric(value):
+            numeric_count += 1
+        elif is_datetime(value):
+            datetime_count += 1
+        elif is_text_number(value):
+            text_number_count += 1
+    
+    # 超过 60% 认定为该类型
+    total = len(column_values)
+    if total == 0:
+        return 'text'
+    if numeric_count / total > 0.6:
+        return 'numeric'
+    elif datetime_count / total > 0.6:
+        return 'datetime'
+    elif text_number_count / total > 0.6:
+        return 'text_number'
+    else:
+        return 'text'
+```
+
+#### 3. 差异严重度分级 ✨
+
+**基础严重度(由单元格内容决定):**
+
+| 差异类型 | 基础严重度 | 说明 |
+|---------|----------|------|
+| `table_amount` | **high** | 金额数字不一致 |
+| `table_datetime` | **medium** | 日期时间不一致 |
+| `table_text` | **low/medium** | 文本不一致(根据相似度) |
+| `table_header_position` | **high** | 表头位置不一致 |
+| `table_header_content` | **high** | 表头内容不一致 |
+| `table_row_missing` | **high** | 行数不一致 |
+| `table_column_missing` | **high** | 列数不一致 |
+
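基础严重度的查表逻辑可以示意如下(`table_text` 的 low/medium 阈值 80 为假设值,实际取值以 `table_comparator.py` 为准):

```python
# 差异类型 -> 基础严重度(示意映射)
BASE_SEVERITY = {
    'table_amount': 'high',
    'table_datetime': 'medium',
    'table_header_position': 'high',
    'table_header_content': 'high',
    'table_row_missing': 'high',
    'table_column_missing': 'high',
}

def base_severity(diff_type: str, similarity: float = 100.0) -> str:
    """table_text 按文本相似度取 low/medium(阈值 80 为假设),其余类型直接查表"""
    if diff_type == 'table_text':
        return 'low' if similarity >= 80 else 'medium'
    return BASE_SEVERITY.get(diff_type, 'low')
```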
+**列类型冲突自动提升:** ✨
+
+```python
+# 如果列类型不一致,自动将严重度提升到 high
+if col_idx in mismatched_columns:
+    if base_severity != 'high':
+        final_severity = 'high'
+        description += " [列类型冲突]"
+```
+
+**示例:**
+
+```json
+{
+  "type": "table_text",
+  "severity": "high",  // 从 low 提升到 high
+  "column_type_mismatch": true,
+  "description": "文本不一致: 流动资产 vs 流动 资产 [列类型冲突]"
+}
+```
+
+#### 4. 表格匹配算法
+
+**智能匹配两个文件中的表格:**
+
+```python
+def find_matching_tables(tables1, tables2):
+    """查找匹配的表格对"""
+    matches = []
+    
+    for idx1, table1 in enumerate(tables1):
+        best_match_idx = -1
+        best_similarity = 0
+        
+        for idx2, table2 in enumerate(tables2):
+            # 计算综合相似度
+            similarity = calculate_table_similarity(table1, table2)
+            
+            if similarity > best_similarity and similarity > 0.5:
+                best_similarity = similarity
+                best_match_idx = idx2
+        
+        if best_match_idx >= 0:
+            matches.append((idx1, best_match_idx, best_similarity))
+    
+    return matches
+```
+
+**相似度计算(总分 100%):**
+
+1. **行列数相似度 (30%)**
+   - 行数相似度 (15%)
+   - 列数相似度 (15%)
+   - ✨ **改进**:容忍 1-2 列差异(如合并列导致)
+
+2. **表头相似度 (50%)** - 最重要
+   - 精确匹配 (40%):完全一致的表头数量
+   - 模糊匹配 (40%):相似度 > 80% 的表头
+   - 语义匹配 (20%):识别常见表头关键词
+
+3. **内容特征相似度 (20%)**
+   - 数据类型分布
+   - 数值范围
+   - 文本特征
+
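上述权重可以直接合成一个总相似度函数(示意,各分项均为 0-100 的分数):

```python
def combine_table_similarity(row_col_sim: float, header_sim: float, content_sim: float) -> float:
    """按 行列数 30% / 表头 50% / 内容特征 20% 加权合成总相似度"""
    return 0.3 * row_col_sim + 0.5 * header_sim + 0.2 * content_sim
```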
+**示例输出:**
+
+```python
+# 匹配结果
+matches = [
+    (0, 0, 0.952),  # 文件1第0个表格 ↔ 文件2第0个表格,相似度 95.2%
+    (1, 1, 0.873),  # 文件1第1个表格 ↔ 文件2第1个表格,相似度 87.3%
+]
+```
+
+### 📝 段落对比
+
+**对比策略:**
+
+1. **段落匹配**
+   - 基于相似度的智能匹配
+   - 支持段落顺序调整
+   - 识别新增/删除的段落
+
+2. **差异检测**
+   - 文本内容差异
+   - 格式差异(如列表、引用)
+   - 图片内容差异(可选)
+
+3. **相似度算法**
+   - `ratio`:标准 Levenshtein 距离
+   - `partial_ratio`:部分匹配
+   - `token_sort_ratio`:排序后匹配
+   - `token_set_ratio`:集合匹配
+
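段落匹配的贪心策略可以示意如下(为使示例零依赖,这里用标准库 `difflib` 近似 `fuzz.ratio`;阈值 0.5 为假设值):

```python
from difflib import SequenceMatcher

def match_paragraphs(paras1, paras2, threshold=0.5):
    """贪心匹配两组段落,返回 (索引1, 索引2, 相似度%) 列表;
    未进入结果的段落即视为新增/删除的段落"""
    matches, used = [], set()
    for i, p1 in enumerate(paras1):
        best_j, best_sim = -1, threshold
        for j, p2 in enumerate(paras2):
            if j in used:
                continue
            sim = SequenceMatcher(None, p1, p2).ratio()
            if sim > best_sim:
                best_j, best_sim = j, sim
        if best_j >= 0:
            used.add(best_j)
            matches.append((i, best_j, round(best_sim * 100, 1)))
    return matches
```

由于逐个段落取最优候选,该策略天然支持段落顺序调整。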
+### 🔍 文本相似度计算
+
+**支持的相似度算法:**
+
+```python
+from fuzzywuzzy import fuzz  # 也可使用 rapidfuzz.fuzz,接口相同
+
+# 1. 标准相似度(ratio)
+similarity = fuzz.ratio("文本1", "文本2")
+# 输出: 67 (0-100)
+
+# 2. 部分匹配(partial_ratio)
+similarity = fuzz.partial_ratio("这是一段很长的文本", "很长的文本")
+# 输出: 100
+
+# 3. 排序后匹配(token_sort_ratio)
+similarity = fuzz.token_sort_ratio("apple banana", "banana apple")
+# 输出: 100
+
+# 4. 集合匹配(token_set_ratio)
+similarity = fuzz.token_set_ratio("the quick brown fox", "brown quick fox")
+# 输出: 100
+```
+
+## 🚀 快速开始
+
+### 1. 基本对比
+
+```bash
+# 对比两个 Markdown 文件
+python comparator/compare_ocr_results.py file1.md file2.md
+
+# 输出 JSON 格式
+python comparator/compare_ocr_results.py file1.md file2.md -f json
+
+# 输出 Markdown 格式
+python comparator/compare_ocr_results.py file1.md file2.md -f markdown
+
+# 同时输出两种格式
+python comparator/compare_ocr_results.py file1.md file2.md -f both
+```
+
+### 2. 流水表格对比 ✨
+
+```bash
+# 使用流水模式(智能表头检测 + 多层表头识别)
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --table-mode flow_list \
+    -o output/comparison_result \
+    -f both
+
+# 资产负债表对比(自动识别多层表头)
+python comparator/compare_ocr_results.py balance_sheet1.md balance_sheet2.md \
+    --table-mode flow_list \
+    --similarity-algorithm ratio \
+    -o balance_sheet_comparison
+```
+
+### 3. 高级对比
+
+```bash
+# 使用 token_set_ratio 算法(集合匹配)
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --similarity-algorithm token_set_ratio
+
+# 忽略图片内容对比
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --ignore-images
+
+# 指定输出文件名
+python comparator/compare_ocr_results.py file1.md file2.md \
+    -o my_comparison_report
+
+# 详细调试信息
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --table-mode flow_list \
+    -v
+```
+
+## 📖 命令行参数
+
+### 必需参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `file1` | string | 第一个文件路径(原OCR结果) |
+| `file2` | string | 第二个文件路径(验证结果) |
+
+### 可选参数
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `-o, --output` | `comparison_report` | 输出文件名(不含扩展名) |
+| `-f, --format` | `markdown` | 输出格式:`json` / `markdown` / `both` |
+| `--table-mode` | `standard` | 表格对比模式:`standard` / `flow_list` |
+| `--similarity-algorithm` | `ratio` | 相似度算法:`ratio` / `partial_ratio` / `token_sort_ratio` / `token_set_ratio` |
+| `--ignore-images` | `False` | 是否忽略图片内容对比 |
+| `-v, --verbose` | `False` | 显示详细调试信息 |
+
+## 📊 输出格式
+
+### JSON 格式
+
+```json
+{
+  "file1": "/path/to/file1.md",
+  "file2": "/path/to/file2.md",
+  "comparison_time": "2025-11-07 14:30:25",
+  "table_mode": "flow_list",
+  "similarity_algorithm": "ratio",
+  
+  "differences": [
+    {
+      "type": "table_amount",
+      "position": "第15行第5列",
+      "file1_value": "15.00",
+      "file2_value": "15,00",
+      "description": "金额不一致: 15.00 vs 15,00",
+      "severity": "high",
+      "column_name": "金额",
+      "column_type": "numeric",
+      "row_index": 15,
+      "col_index": 4
+    },
+    {
+      "type": "table_header_position",
+      "position": "表头位置",
+      "file1_value": "第1行",
+      "file2_value": "第2行",
+      "description": "表头位置不一致: 文件1在第1行,文件2在第2行",
+      "severity": "high"
+    },
+    {
+      "type": "table_text",
+      "position": "第20行第3列",
+      "file1_value": "流动资产",
+      "file2_value": "流动 资产",
+      "description": "文本不一致: 流动资产 vs 流动 资产 [列类型冲突]",
+      "severity": "high",
+      "column_type_mismatch": true
+    }
+  ],
+  
+  "statistics": {
+    "total_differences": 42,
+    "table_differences": 35,
+    "amount_differences": 8,
+    "datetime_differences": 3,
+    "text_differences": 24,
+    "paragraph_differences": 7,
+    "critical_severity": 2,
+    "high_severity": 11,
+    "medium_severity": 17,
+    "low_severity": 12
+  },
+  
+  "table_matches": [
+    {
+      "file1_table_index": 0,
+      "file2_table_index": 0,
+      "similarity": 95.2,
+      "header_position_file1": 1,
+      "header_position_file2": 1,
+      "row_count_file1": 10,
+      "row_count_file2": 10,
+      "column_count_file1": 6,
+      "column_count_file2": 6
+    }
+  ]
+}
+```
+
+### Markdown 格式
+
+```markdown
+# OCR结果对比报告
+
+## 📋 基本信息
+- **原OCR结果**: `/path/to/file1.md`
+- **验证结果**: `/path/to/file2.md`
+- **对比时间**: `2025-11-07 14:30:25`
+- **表格对比模式**: `flow_list`
+- **相似度算法**: `ratio`
+
+---
+
+## 📊 统计信息
+- **总差异数量**: 42
+- **表格差异**: 35
+  - 金额差异: 8 (严重度: high)
+  - 日期差异: 3 (严重度: medium)
+  - 文本差异: 24 (严重度: low/medium)
+  - 列类型冲突: 3 (严重度提升至: high)
+- **段落差异**: 7
+
+---
+
+## 📈 严重度分布
+- ❌ **严重差异 (Critical)**: 2
+- ⚠️ **高优先级 (High)**: 11
+- ℹ️ **中优先级 (Medium)**: 17
+- 💡 **低优先级 (Low)**: 12
+
+---
+
+## 🔍 表格匹配情况
+
+### 表格 #1 ↔ 表格 #1 (相似度: 95.2%)
+- **行数对比**: 10 vs 10, 相似度: 100.0%
+- **列数对比**: 6 vs 6, 相似度: 100.0%
+- **表头位置**: 文件1第1行, 文件2第1行
+- **表头相似度**: 92.5%
+  - 精确匹配: 83%
+  - 模糊匹配: 100%
+  - 语义匹配: 100%
+
+---
+
+## 📝 差异详情(按严重度分类)
+
+### ❌ 严重差异 (Critical)
+
+| 序号 | 位置 | 类型 | 原OCR结果 | 验证结果 | 描述 |
+|------|------|------|-----------|----------|------|
+| 1 | 表格列类型 | table_header_critical | 5列类型不一致 | 共10列 | 列类型差异过大 (50%) |
+
+---
+
+### ⚠️ 高优先级差异 (High)
+
+| 序号 | 位置 | 类型 | 原OCR结果 | 验证结果 | 描述 |
+|------|------|------|-----------|----------|------|
+| 1 | 第15行第5列 | table_amount | 15.00 | 15,00 | 金额不一致 |
+| 2 | 第20行第3列 | table_text | 流动资产 | 流动 资产 | 文本不一致 [列类型冲突] |
+| 3 | 表头位置 | table_header_position | 第1行 | 第2行 | 表头位置不一致 |
+
+---
+
+### ℹ️ 中优先级差异 (Medium)
+
+| 序号 | 位置 | 类型 | 原OCR结果 | 验证结果 | 描述 |
+|------|------|------|-----------|----------|------|
+| 1 | 第8行第2列 | table_datetime | 2023-12-31 | 2023年12月31日 | 日期格式不一致 |
+
+---
+
+### 💡 低优先级差异 (Low)
+
+| 序号 | 位置 | 类型 | 原OCR结果 | 验证结果 | 描述 |
+|------|------|------|-----------|----------|------|
+| 1 | 第3行第1列 | table_text | 现金及现金等价物 | 现金及 现金等价物 | 文本相似度: 92.3% |
+```
+
+## 🎯 使用场景示例
+
+### 场景 1:银行流水对比
+
+```bash
+# 命令
+python comparator/compare_ocr_results.py \
+    /data/银行流水/dotsocr/page_001.md \
+    /data/银行流水/paddleocr_vl/page_001.md \
+    --table-mode flow_list \
+    --similarity-algorithm ratio \
+    -o output/bank_flow_comparison \
+    -f both
+
+# 特点
+# ✅ 自动检测表头(日期、金额、余额等关键词)
+# ✅ 列类型检测(数字金额、日期、文本型数字)
+# ✅ 金额差异高优先级
+# ✅ 列类型冲突自动提升严重度
+```
+
+### 场景 2:资产负债表对比 ✨
+
+```bash
+# 命令
+python comparator/compare_ocr_results.py \
+    /data/年报/mineru/balance_sheet.md \
+    /data/年报/ppstructv3/balance_sheet.md \
+    --table-mode flow_list \
+    --similarity-algorithm ratio \
+    -o output/balance_sheet_comparison \
+    -v
+
+# 特点
+# ✅ 自动识别多层表头(总表头 + 分类标题)
+# ✅ 检测"流动资产:"等分类行
+# ✅ 智能评分(分类行 +0.1,数据行 +0.2)
+# ✅ 详细调试信息(-v 参数)
+
+# 调试输出示例
+# 📍 检测到表头在第 1 行 (得分: 0.87)
+#    - 关键词: "资产"(0.25) + "余额"(0.50) + "负债"(0.25)
+#    - 下一行: 分类行 (+0.1)
+#    - 总得分: 1.0 + 0.1 = 1.1
+```
+
+### 场景 3:利润表对比
+
+```bash
+# 命令
+python comparator/compare_ocr_results.py \
+    /data/财报/paddleocr_vl/income_statement.md \
+    /data/财报/dots_ocr/income_statement.md \
+    --table-mode flow_list \
+    --similarity-algorithm token_set_ratio \
+    -o output/income_statement_comparison
+
+# 特点
+# ✅ 集合匹配算法(容忍词序差异)
+# ✅ 自动检测"收入"、"成本"等关键词
+# ✅ 数值列精确对比
+```
+
+### 场景 4:批量对比
+
+```bash
+# 批量对比脚本
+for file1 in /data/dotsocr/*.md; do
+    file2="/data/paddleocr_vl/$(basename $file1)"
+    if [ -f "$file2" ]; then
+        python comparator/compare_ocr_results.py \
+            "$file1" "$file2" \
+            --table-mode flow_list \
+            -o "output/$(basename $file1 .md)_comparison" \
+            -f json
+    fi
+done
+```
+
+## 🔧 编程接口
+
+### 直接调用 OCRResultComparator
+
+```python
+from ocr_comparator.ocr_comparator import OCRResultComparator
+
+# 初始化对比器
+comparator = OCRResultComparator(
+    table_mode='flow_list',
+    similarity_algorithm='ratio',
+    ignore_images=False
+)
+
+# 从文件加载内容
+with open('file1.md', 'r', encoding='utf-8') as f:
+    content1 = f.read()
+with open('file2.md', 'r', encoding='utf-8') as f:
+    content2 = f.read()
+
+# 执行对比
+result = comparator.compare(content1, content2)
+
+# 查看结果
+print(f"总差异数: {result['statistics']['total_differences']}")
+print(f"表格差异: {result['statistics']['table_differences']}")
+print(f"段落差异: {result['statistics']['paragraph_differences']}")
+
+# 获取高优先级差异
+high_diffs = [d for d in result['differences'] if d['severity'] == 'high']
+print(f"高优先级差异: {len(high_diffs)}")
+```
+
+### 表格对比器独立使用
+
+```python
+from ocr_comparator.table_comparator import TableComparator
+
+# 初始化表格对比器
+table_comparator = TableComparator(
+    mode='flow_list',
+    similarity_algorithm='ratio'
+)
+
+# 准备表格数据
+table1 = [
+    ['日期', '金额', '余额'],
+    ['2023-01-01', '1000.00', '5000.00'],
+    ['2023-01-02', '500.00', '5500.00']
+]
+
+table2 = [
+    ['日期', '金额', '余额'],
+    ['2023-01-01', '1,000.00', '5,000.00'],
+    ['2023-01-02', '500.00', '5500.00']
+]
+
+# 执行对比
+differences = table_comparator.compare_tables(table1, table2)
+
+# 分析差异
+for diff in differences:
+    print(f"{diff['position']}: {diff['description']} (严重度: {diff['severity']})")
+```
+
+### 相似度计算器独立使用
+
+```python
+from ocr_comparator.similarity_calculator import SimilarityCalculator
+
+# 初始化计算器
+calculator = SimilarityCalculator(algorithm='ratio')
+
+# 计算文本相似度
+similarity = calculator.calculate("流动资产", "流动 资产")
+print(f"相似度: {similarity}%")  # 输出约 89%
+
+# 切换算法
+calculator.set_algorithm('token_set_ratio')
+similarity = calculator.calculate("apple banana", "banana apple")
+print(f"相似度: {similarity}%")  # 输出: 100%
+```
+
+### 数据类型检测器独立使用
+
+```python
+from ocr_comparator.data_type_detector import DataTypeDetector
+
+# 初始化检测器
+detector = DataTypeDetector()
+
+# 检测单个值
+print(detector.detect_type("28,239,305.48"))  # 输出: numeric
+print(detector.detect_type("2023-12-31"))     # 输出: datetime
+print(detector.detect_type("20231231001"))    # 输出: text_number
+print(detector.detect_type("货币资金"))        # 输出: text
+
+# 检测列类型
+column_values = ["1000.00", "2000.50", "3000.75", "文本"]
+column_type = detector.detect_column_type(column_values)
+print(f"列类型: {column_type}")  # 输出: numeric (75% 是数字)
+```
+
+## 🐛 调试技巧
+
+### 1. 启用详细日志
+
+```bash
+# 使用 -v 参数
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --table-mode flow_list \
+    -v
+```
+
+**输出示例:**
+
+```
+🔍 开始对比...
+📄 文件1: /path/to/file1.md
+📄 文件2: /path/to/file2.md
+⚙️ 表格模式: flow_list
+⚙️ 相似度算法: ratio
+
+📊 提取表格...
+   文件1: 发现 2 个表格
+   文件2: 发现 2 个表格
+
+🔗 匹配表格...
+   表格 #1 ↔ 表格 #1: 相似度 95.2%
+
+📍 检测表头位置...
+   文件1表格1: 检测到表头在第 1 行 (得分: 0.87)
+      关键词: "资产"(0.25) + "余额"(0.50) + "负债"(0.25)
+      下一行: 分类行 (+0.1)
+   文件2表格1: 检测到表头在第 1 行 (得分: 0.85)
+
+🔍 对比单元格...
+   第15行第5列: 金额差异 (15.00 vs 15,00) [严重度: high]
+   第20行第3列: 文本差异 (流动资产 vs 流动 资产) [列类型冲突] [严重度: high]
+
+✅ 对比完成
+   总差异: 42
+   表格差异: 35
+   段落差异: 7
+```
+
+### 2. 检查表格匹配
+
+```python
+# 查看表格匹配结果
+result = comparator.compare(content1, content2)
+for match in result['table_matches']:
+    print(f"表格 #{match['file1_table_index']} ↔ #{match['file2_table_index']}")
+    print(f"  相似度: {match['similarity']}%")
+    print(f"  行数: {match['row_count_file1']} vs {match['row_count_file2']}")
+    print(f"  列数: {match['column_count_file1']} vs {match['column_count_file2']}")
+```
+
+### 3. 分析列类型冲突
+
+```python
+# 过滤列类型冲突的差异
+type_conflicts = [
+    d for d in result['differences'] 
+    if d.get('column_type_mismatch', False)
+]
+
+for diff in type_conflicts:
+    print(f"位置: {diff['position']}")
+    print(f"文件1: {diff['file1_value']}")
+    print(f"文件2: {diff['file2_value']}")
+    print(f"基础严重度: {diff.get('base_severity', 'N/A')}")
+    print(f"最终严重度: {diff['severity']}")
+    print()
+```
+
+### 4. 检查表头检测结果
+
+```python
+# 手动检测表头
+from ocr_comparator.table_comparator import TableComparator
+
+comparator = TableComparator(mode='flow_list')
+header_row_idx = comparator._detect_table_header_row(table)
+
+print(f"检测到表头在第 {header_row_idx + 1} 行")
+
+# 查看评分详情
+for idx, row in enumerate(table):
+    score = comparator._score_header_row(row, table, idx)
+    print(f"第 {idx + 1} 行: 得分 {score:.2f}")
+```
+
+## 📚 常见问题
+
+### Q1: 表头检测不准确?
+
+**A:** 
+- 检查表格是否包含表头关键词(日期、金额、余额等)
+- 使用 `-v` 参数查看详细评分信息
+- 手动指定表头位置(在代码中设置 `header_row_idx`)
+
+### Q2: 列类型检测错误?
+
+**A:** 
+- 检查列的数据一致性(是否混合了不同类型)
+- 调整检测阈值(默认 60%,可在代码中修改)
+- 查看检测日志了解判断依据
+
+### Q3: 差异过多且都是 high 严重度?
+
+**A:** 
+- 检查是否存在列类型冲突(会自动提升严重度)
+- 使用不同的相似度算法(如 `token_set_ratio`)
+- 确认表格结构是否一致(行列数)
+
+### Q4: 多层表头识别失败?✨
+
+**A:** 
+- 确认表格结构符合预期:
+  - 第1行:总表头
+  - 第2行:分类标题(如"流动资产:")
+  - 第3行起:数据行
+- 检查分类行格式:
+  - 第一个单元格包含关键词 + 冒号
+  - 其他单元格为空
+- 使用 `-v` 参数查看检测详情
+
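分类行的判定条件可以示意为(假设的简化实现,实际检测逻辑在 `table_comparator.py` 中):

```python
def is_category_row(row):
    """首个单元格以冒号结尾(如"流动资产:"),且其余单元格为空,判定为分类行"""
    if not row or not str(row[0]).strip():
        return False
    first = str(row[0]).strip()
    return first.endswith((':', ':')) and all(not str(c).strip() for c in row[1:])
```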
+### Q5: 表格匹配失败?
+
+**A:** 
+- 检查表格相似度阈值(默认 50%)
+- 查看表头相似度(最重要的匹配因素)
+- 确认行列数差异是否过大
+
+### Q6: 金额格式差异导致误报?
+
+**A:** 
+- 使用数字标准化工具预处理:
+  ```python
+  from normalize_financial_numbers import normalize_financial_numbers
+  normalized = normalize_financial_numbers(text)
+  ```
+- 或在对比前手动统一格式
+
+### Q7: 相似度计算结果异常?
+
+**A:** 
+- 尝试不同的相似度算法
+- 检查文本是否包含特殊字符
+- 确认编码格式正确(UTF-8)
+
+## 🎓 最佳实践
+
+### 1. 选择合适的对比模式
+
+| 文档类型 | 推荐模式 | 理由 |
+|---------|---------|------|
+| 固定格式报表 | `standard` | 结构稳定,逐行对比更精确 |
+| 银行流水 | `flow_list` | 表头位置可能变化,需要智能检测 |
+| 资产负债表 | `flow_list` | 支持多层表头识别 ✨ |
+| 利润表 | `flow_list` | 自动检测"收入"、"成本"等关键词 |
+| 交易记录 | `flow_list` | 列类型多样,需要类型检测 |
+
+### 2. 选择合适的相似度算法
+
+| 算法 | 适用场景 | 特点 |
+|------|---------|------|
+| `ratio` | 精确对比 | 严格匹配,适合格式统一的文本 |
+| `partial_ratio` | 部分匹配 | 适合长短文本对比 |
+| `token_sort_ratio` | 词序差异 | 容忍词序不同 |
+| `token_set_ratio` | 集合匹配 | 容忍重复词、词序 |
+
+### 3. 处理常见差异类型
+
+**金额差异:**
+```bash
+# 预处理:标准化金额格式
+python normalize_financial_numbers.py input.json output.json
+
+# 对比
+python comparator/compare_ocr_results.py file1.md file2.md \
+    --table-mode flow_list
+```
+
+**日期格式差异:**
+```python
+import re
+
+# 在对比前统一日期格式
+def normalize_date(date_str):
+    """2023-12-31 / 2023/12/31 → 2023年12月31日"""
+    m = re.match(r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})$', date_str.strip())
+    if m:
+        return f"{m.group(1)}年{int(m.group(2))}月{int(m.group(3))}日"
+    return date_str
+```
+
+**列类型冲突:**
+```python
+# 检查冲突原因
+type_conflicts = [d for d in differences if d.get('column_type_mismatch')]
+for diff in type_conflicts:
+    print(f"{diff['position']}: {diff['file1_value']} vs {diff['file2_value']}")
+    # 分析是 OCR 错误还是格式差异
+```
+
+### 4. 批量对比策略
+
+```bash
+#!/bin/bash
+# batch_compare.sh
+
+# 配置
+SOURCE_DIR="/data/dotsocr"
+TARGET_DIR="/data/paddleocr_vl"
+OUTPUT_DIR="/output/comparisons"
+
+# 批量对比
+for file1 in "$SOURCE_DIR"/*.md; do
+    filename=$(basename "$file1")
+    file2="$TARGET_DIR/$filename"
+    
+    if [ -f "$file2" ]; then
+        echo "对比: $filename"
+        python comparator/compare_ocr_results.py \
+            "$file1" "$file2" \
+            --table-mode flow_list \
+            --similarity-algorithm ratio \
+            -o "$OUTPUT_DIR/${filename%.md}_comparison" \
+            -f both
+    else
+        echo "跳过: $filename (目标文件不存在)"
+    fi
+done
+
+echo "✅ 批量对比完成"
+```
+
+### 5. 结果分析流程
+
+```python
+import json
+
+# 加载对比结果
+with open('comparison_result.json', 'r') as f:
+    result = json.load(f)
+
+# 1. 统计分析
+stats = result['statistics']
+print(f"总差异: {stats['total_differences']}")
+print(f"表格差异: {stats['table_differences']}")
+
+# 2. 严重度分布
+print(f"Critical: {stats['critical_severity']}")
+print(f"High: {stats['high_severity']}")
+print(f"Medium: {stats['medium_severity']}")
+print(f"Low: {stats['low_severity']}")
+
+# 3. 差异类型分布
+type_counts = {}
+for diff in result['differences']:
+    type_counts[diff['type']] = type_counts.get(diff['type'], 0) + 1
+
+for diff_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
+    print(f"{diff_type}: {count}")
+
+# 4. 关注高优先级差异
+high_diffs = [d for d in result['differences'] if d['severity'] == 'high']
+for diff in high_diffs:
+    print(f"⚠️ {diff['position']}: {diff['description']}")
+```
+
+## 📝 开发指南
+
+### 扩展新的数据类型检测
+
+```python
+# 在 data_type_detector.py 中添加
+def is_currency(value):
+    """检测是否为货币格式"""
+    patterns = [
+        r'¥\s*[\d,]+\.?\d*',
+        r'\$\s*[\d,]+\.?\d*',
+        r'[\d,]+\.?\d*\s*元',
+    ]
+    return any(re.match(pattern, str(value)) for pattern in patterns)
+```
+
+### 自定义相似度算法
+
+```python
+# 在 similarity_calculator.py 中添加
+def custom_similarity(text1, text2):
+    """自定义相似度计算(示例:基于字符集合的 Jaccard 相似度,返回 0-100)"""
+    s1, s2 = set(text1), set(text2)
+    if not (s1 or s2):
+        return 100.0
+    return len(s1 & s2) / len(s1 | s2) * 100
+
+# 注册算法
+SimilarityCalculator.register_algorithm('custom', custom_similarity)
+```
+
+### 扩展表格对比逻辑
+
+```python
+# 继承 TableComparator
+class CustomTableComparator(TableComparator):
+    def _detect_table_header_row(self, table):
+        """自定义表头检测逻辑"""
+        # 实现自定义检测
+        pass
+    
+    def _compare_cells(self, cell1, cell2, column_type):
+        """自定义单元格对比"""
+        # 实现自定义对比
+        pass
+```
+
+## 📄 许可证
+
+本模块采用 MIT 许可证。
+
+---
+
+**最后更新**: 2025年11月7日
+**维护者**: zhch158_admin

+ 38 - 0
ocr_comparator/__init__.py

@@ -0,0 +1,38 @@
+"""
+OCR 结果比较器包
+
+提供 OCR 结果比较功能,支持表格和段落的智能对比。
+"""
+
+import sys
+from pathlib import Path
+
+# 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
+# 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
+_file_path = Path(__file__).resolve()
+ocr_platform_root = _file_path.parents[1]  # ocr_comparator -> ocr_platform
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+from .compare_ocr_results import compare_ocr_results
+from .ocr_comparator import OCRResultComparator
+from .report_generator import ReportGenerator
+from .content_extractor import ContentExtractor
+from .table_comparator import TableComparator
+from .paragraph_comparator import ParagraphComparator
+from .data_type_detector import DataTypeDetector
+from .similarity_calculator import SimilarityCalculator
+from .text_processor import TextProcessor
+
+__all__ = [
+    'compare_ocr_results',
+    'OCRResultComparator',
+    'ReportGenerator',
+    'ContentExtractor',
+    'TableComparator',
+    'ParagraphComparator',
+    'DataTypeDetector',
+    'SimilarityCalculator',
+    'TextProcessor',
+]
+

+ 100 - 0
ocr_comparator/compare_ocr_results.py

@@ -0,0 +1,100 @@
+import argparse
+import sys
+from typing import Dict
+from pathlib import Path
+
+# 添加 ocr_platform 根目录到 Python 路径
+# 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
+_file_path = Path(__file__).resolve()
+ocr_platform_root = _file_path.parents[1]  # compare_ocr_results.py -> ocr_comparator -> ocr_platform
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+# ✅ 兼容相对导入和绝对导入
+try:
+    from .ocr_comparator import OCRResultComparator
+    from .report_generator import ReportGenerator
+except ImportError:
+    from ocr_comparator import OCRResultComparator
+    from report_generator import ReportGenerator
+
+def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
+                       output_format: str = "markdown", ignore_images: bool = True,
+                       table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
+    """
+    比较两个OCR结果文件
+    
+    Args:
+        file1_path: 第一个OCR结果文件路径
+        file2_path: 第二个OCR结果文件路径
+        output_file: 输出文件名(不含扩展名)
+        output_format: 输出格式 ('json', 'markdown', 'both')
+        ignore_images: 是否忽略图片内容
+        table_mode: 表格比较模式 ('standard', 'flow_list')
+        similarity_algorithm: 相似度算法 ('ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio')
+    """
+    comparator = OCRResultComparator()
+    comparator.table_comparison_mode = table_mode
+    
+    print("🔍 开始对比OCR结果...")
+    print(f"📄 文件1: {file1_path}")
+    print(f"📄 文件2: {file2_path}")
+    print(f"📊 表格模式: {table_mode}")
+    print(f"🔧 相似度算法: {similarity_algorithm}")
+    
+    try:
+        # 执行比较
+        result = comparator.compare_files(file1_path, file2_path)
+        
+        # 生成报告
+        print(f"\n📝 生成报告...")
+        ReportGenerator.generate_report(result, output_file, output_format)
+        
+        print(f"\n✅ 对比完成!")
+        return result
+        
+    except Exception as e:
+        print(f"\n❌ 对比过程中出错: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='OCR结果对比工具')
+    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
+    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
+    parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
+    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'], 
+                       default='markdown', help='输出格式')
+    parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
+    parser.add_argument('--table-mode', choices=['standard', 'flow_list'], 
+                       default='standard', help='表格比较模式')
+    parser.add_argument('--similarity-algorithm', 
+                       choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
+                       default='ratio', help='相似度算法')
+    
+    args = parser.parse_args()
+
+    if args.file1 and args.file2:
+        compare_ocr_results(
+            args.file1, 
+            args.file2, 
+            args.output, 
+            args.format,
+            args.ignore_images,
+            args.table_mode,
+            args.similarity_algorithm
+        )
+    else:
+        # 测试流水表格对比
+        import time
+        result = compare_ocr_results(
+            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
+            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
+            output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
+            output_format='both',
+            ignore_images=True,
+            table_mode='flow_list',  # 使用流水表格模式
+            similarity_algorithm='ratio'
+        )

+ 216 - 0
ocr_comparator/content_extractor.py

@@ -0,0 +1,216 @@
+import re
+from typing import List, Dict
+from bs4 import BeautifulSoup
+
+try:
+    from .text_processor import TextProcessor
+except ImportError:
+    from text_processor import TextProcessor
+
+
+class ContentExtractor:
+    """从Markdown中提取表格和段落"""
+    
+    def __init__(self):
+        self.text_processor = TextProcessor()
+    
+    def _normalize_text(self, text: str) -> str:
+        """标准化文本:去除多余空格、回车等无效字符"""
+        if not text:
+            return ""
+        # 去除多余的空白字符
+        text = re.sub(r'\s+', ' ', text.strip())
+        # 去除标点符号周围的空格
+        text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
+        return text
+    
+    def _is_image_reference(self, text: str) -> bool:
+        """判断是否为图片引用或描述"""
+        image_keywords = [
+            '图', '图片', '图像', 'image', 'figure', 'fig',
+            '照片', '截图', '示意图', '流程图', '结构图'
+        ]
+        # 检查是否包含图片相关关键词
+        for keyword in image_keywords:
+            if keyword in text.lower():
+                return True
+        
+        # 检查是否为Markdown图片语法
+        if re.search(r'!\[.*?\]\(.*?\)', text):
+            return True
+            
+        # 检查是否为HTML图片标签
+        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
+            return True
+            
+        return False
+
+    def extract_structured_content(self, content: str) -> Dict:
+        """
+        提取结构化内容,返回表格和段落块
+        
+        Returns:
+            {
+                'tables': [
+                    {'start_pos': int, 'end_pos': int, 'data': List[List[str]]},
+                    ...
+                ],
+                'paragraph_blocks': [
+                    {'start_pos': int, 'end_pos': int, 'paragraphs': List[str]},
+                    ...
+                ]
+            }
+        """
+        # 查找所有表格的位置
+        table_pattern = r'<table>.*?</table>'
+        tables = []
+        paragraph_blocks = []
+        
+        last_pos = 0
+        
+        for match in re.finditer(table_pattern, content, re.DOTALL):
+            start_pos = match.start()
+            end_pos = match.end()
+            
+            # 提取表格前的段落块
+            if start_pos > last_pos:
+                #[last_pos:start_pos) 左闭右开区间
+                before_table_content = content[last_pos:start_pos]
+                paragraphs = self.extract_paragraphs(before_table_content)
+                if paragraphs:
+                    paragraph_blocks.append({
+                        'start_pos': last_pos,
+                        'end_pos': start_pos,
+                        'paragraphs': paragraphs
+                    })
+            
+            # 提取表格数据
+            table_html = match.group()
+            table_data = self._parse_table_html(table_html)
+            tables.append({
+                'start_pos': start_pos,
+                'end_pos': end_pos,
+                'data': table_data
+            })
+            
+            last_pos = end_pos
+        
+        # 提取最后一个表格后的段落
+        if last_pos < len(content):
+            after_table_content = content[last_pos:]
+            paragraphs = self.extract_paragraphs(after_table_content)
+            if paragraphs:
+                paragraph_blocks.append({
+                    'start_pos': last_pos,
+                    'end_pos': len(content),
+                    'paragraphs': paragraphs
+                })
+        
+        return {
+            'tables': tables,
+            'paragraph_blocks': paragraph_blocks
+        }
+    
+    def extract_table_data(self, content: str) -> List[List[List[str]]]:
+        """提取所有表格数据(保持原有接口兼容)"""
+        structured = self.extract_structured_content(content)
+        return [t['data'] for t in structured['tables']]
+    
+    def _parse_table_html(self, html: str) -> List[List[str]]:
+        """
+        解析HTML表格为二维数组
+        
+        Args:
+            html: HTML表格字符串
+        
+        Returns:
+            二维数组,每个元素为单元格文本
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        table = soup.find('table')
+        
+        if not table:
+            return []
+        
+        table_data = []
+        rows = table.find_all('tr')
+        
+        for row in rows:
+            cells = row.find_all(['td', 'th'])
+            row_data = []
+            for cell in cells:
+                cell_text = self._normalize_text(cell.get_text())
+                # 跳过图片内容
+                if not self._is_image_reference(cell_text):
+                    row_data.append(cell_text)
+                else:
+                    row_data.append("[图片内容-忽略]")
+                    
+            if row_data:  # 只添加非空行
+                table_data.append(row_data)
+        
+        return table_data
+    
+    def merge_split_paragraphs(self, lines: List[str]) -> List[str]:
+        # 合并连续的非空行作为一个段落,且过滤图片内容
+        merged_lines = []
+        current_paragraph = ""
+        for i, line in enumerate(lines):
+            # 跳过空行
+            if not line:
+                if current_paragraph:
+                    merged_lines.append(current_paragraph)
+                    current_paragraph = ""
+                continue
+            # 跳过图片内容
+            if self._is_image_reference(line):
+                continue
+
+            # 检查是否是标题(以数字、中文数字或特殊标记开头)
+            is_title = (
+                line.startswith(('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、')) or
+                line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')) or
+                line.startswith('#')
+            )
+            # 如果是标题,结束当前段落
+            if is_title:
+                if current_paragraph:
+                    merged_lines.append(current_paragraph)
+                    current_paragraph = ""
+                merged_lines.append(line)
+            else:
+                # 检查是否应该与前一行合并 # 如果当前段落不为空,且当前段落最后一个字符非空白字符
+                if current_paragraph and not current_paragraph.endswith((' ', '\t')):
+                    current_paragraph += line
+                else:
+                    if current_paragraph:
+                        merged_lines.append(current_paragraph)
+                    current_paragraph = line
+        
+        # 处理最后一个段落
+        if current_paragraph:
+            merged_lines.append(current_paragraph)
+        
+        return merged_lines
+    
+    def extract_paragraphs(self, content: str) -> List[str]:
+        """提取段落内容"""
+        # 移除HTML标签
+        content_no_html = re.sub(r'<[^>]+>', '', content)
+        
+        # 移除bbox注释
+        content_no_bbox = re.sub(r'<!--.*?-->', '', content_no_html)
+        
+        # 按换行符分割
+        paragraphs = []
+        lines = content_no_bbox.split('\n')
+        merged_lines = self.merge_split_paragraphs(lines)
+        
+        for line in merged_lines:
+            normalized = self._normalize_text(line)
+            if normalized:
+                paragraphs.append(normalized)
+            else:
+                print(f"跳过无效内容或图片段落: {line[0:30] if line else ''}...")
+
+        return paragraphs

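上面的 `merge_split_paragraphs` 负责把 OCR 输出中被硬换行拆散的行重新拼回段落。下面是一个仅依赖标准库的最小示意(标题判断规则做了简化,完整规则以上方实现为准):

```python
# 最小示意:按标题/空行切分,把被硬换行拆散的行合并成段落
# (标题前缀规则为简化假设,实际实现见上方 merge_split_paragraphs)
def merge_lines(lines):
    merged, buf = [], ""
    for line in lines:
        line = line.strip()
        if not line:                              # 空行结束当前段落
            if buf:
                merged.append(buf)
                buf = ""
            continue
        if line.startswith(("#", "一、", "1.")):  # 标题单独成段(简化)
            if buf:
                merged.append(buf)
                buf = ""
            merged.append(line)
        else:
            buf += line                           # 续行直接拼接(中文无需空格)
    if buf:
        merged.append(buf)
    return merged

print(merge_lines(["# 标题", "货币资金包括库存现金、", "银行存款。", "", "下一段"]))
# → ['# 标题', '货币资金包括库存现金、银行存款。', '下一段']
```

这样跨行的长句会被还原成一个段落,后续的段落相似度对比才不会被换行位置干扰。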
+ 176 - 0
ocr_comparator/data_type_detector.py

@@ -0,0 +1,176 @@
+import re
+import sys
+from typing import List
+from pathlib import Path
+
+# 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
+# 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
+_file_path = Path(__file__).resolve()
+ocr_platform_root = _file_path.parents[1]  # data_type_detector.py -> ocr_comparator -> ocr_platform
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+# 从 ocr_utils 导入数字解析函数
+from ocr_utils.number_utils import parse_number, normalize_text_number
+
+
+class DataTypeDetector:
+    """数据类型检测和解析"""
+    
+    @staticmethod
+    def is_numeric(text: str) -> bool:
+        """判断文本是否为数字(15位以内的数值)"""
+        if not text:
+            return False
+        
+        clean_text = re.sub(r'[,,\s-]', '', text)
+        
+        if len(clean_text) > 15:
+            return False
+        
+        try:
+            float(clean_text)
+            return True
+        except ValueError:
+            return False
+    
+    @staticmethod
+    def is_text_number(text: str) -> bool:
+        """判断是否为文本型数字(如账号、订单号)"""
+        if not text:
+            return False
+        
+        clean_text = re.sub(r'[\s-]', '', text)
+        
+        # 排除日期格式 yyyymmdd
+        if len(clean_text) == 8 and clean_text.isdigit():
+            # 检查是否为合法日期
+            try:
+                year = int(clean_text[:4])
+                month = int(clean_text[4:6])
+                day = int(clean_text[6:8])
+                if 1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31:
+                    return False  # 这是日期,不是文本型数字
+            except ValueError:
+                pass
+        
+        if clean_text.isdigit() and len(clean_text) > 15:
+            return True
+        
+        if re.match(r'^[\d\s-]+$', text) and len(clean_text) > 10:
+            return True
+        
+        return False
+    
+    @staticmethod
+    def is_datetime(text: str) -> bool:
+        """判断文本是否为日期时间格式"""
+        if not text:
+            return False
+        
+        datetime_patterns = [
+            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
+            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',
+            r'\d{4}年\d{1,2}月\d{1,2}日',
+            r'^\d{8}$',  # yyyymmdd 格式
+        ]
+        
+        for pattern in datetime_patterns:
+            if re.search(pattern, str(text).strip()):
+                # 对于 yyyymmdd 格式,验证日期合法性
+                if pattern == r'^\d{8}$':
+                    try:
+                        clean_text = str(text).strip()
+                        year = int(clean_text[:4])
+                        month = int(clean_text[4:6])
+                        day = int(clean_text[6:8])
+                        if 1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31:
+                            return True
+                    except (ValueError, IndexError):
+                        continue
+                else:
+                    return True
+        
+        return False
+    
+    @staticmethod
+    def parse_number(text: str) -> float:
+        """
+        解析数字,处理千分位和货币符号
+        
+        此方法是对 ocr_utils.number_utils.parse_number 的包装,保持向后兼容性。
+        """
+        return parse_number(text)
+    
+    @staticmethod
+    def extract_datetime(text: str) -> str:
+        """提取并标准化日期时间"""
+        patterns = [
+            # yyyy-mm-dd hh:mm:ss
+            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})\s*(\d{1,2}):(\d{1,2}):(\d{1,2})', 
+             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)} {m.group(4).zfill(2)}:{m.group(5).zfill(2)}:{m.group(6).zfill(2)}"),
+            # yyyy-mm-dd
+            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})', 
+             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
+            # yyyy年mm月dd日
+            (r'(\d{4})年(\d{1,2})月(\d{1,2})日', 
+             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
+            # yyyymmdd(统一为 yyyy-mm-dd,与其他格式的标准化结果保持一致)
+            (r'^(\d{4})(\d{2})(\d{2})$',
+             lambda m: f"{m.group(1)}-{m.group(2)}-{m.group(3)}"),
+        ]
+        
+        for pattern, formatter in patterns:
+            match = re.search(pattern, str(text).strip())
+            if match:
+                if pattern == r'^(\d{4})(\d{2})(\d{2})$':
+                    # 验证日期合法性
+                    try:
+                        year = int(match.group(1))
+                        month = int(match.group(2))
+                        day = int(match.group(3))
+                        if not (1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31):
+                            continue
+                    except ValueError:
+                        continue
+                return formatter(match)
+        
+        return text
+    
+    @staticmethod
+    def detect_column_type(column_values: List[str]) -> str:
+        """检测列的数据类型"""
+        if not column_values:
+            return 'text'
+        
+        non_empty_values = [v for v in column_values if v and v.strip() and v not in ['/', '-']]
+        if not non_empty_values:
+            return 'text'
+        
+        # 先检测日期时间(优先级最高,避免 yyyymmdd 被误判为文本型数字)
+        datetime_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_datetime(v))
+        if datetime_count >= len(non_empty_values[:5]) * 0.6:
+            return 'datetime'
+        
+        # 检测文本型数字(账号、票据号等按文本处理,避免被当作数值比较)
+        text_number_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_text_number(v))
+        if text_number_count >= len(non_empty_values[:5]) * 0.6:
+            return 'text'
+        
+        # 检测数字
+        numeric_count = sum(1 for v in non_empty_values[:5] 
+                           if DataTypeDetector.is_numeric(v) and not DataTypeDetector.is_text_number(v))
+        
+        if numeric_count >= len(non_empty_values[:5]) * 0.6:
+            return 'numeric'
+        
+        return 'text'
+    
+    @staticmethod
+    def normalize_text_number(text: str) -> str:
+        """
+        标准化文本型数字:移除空格和连字符
+        
+        此方法是对 ocr_utils.number_utils.normalize_text_number 的包装,保持向后兼容性。
+        """
+        return normalize_text_number(text)

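`detect_column_type` 的核心是对前 5 个非空值做投票:按 日期 > 文本型数字 > 数值 的优先级,某一类占比达到 60% 即判定为该类型。下面用简化版的判定函数演示这套投票逻辑(`is_dt` / `is_num` 为演示用的简化正则,完整判定见上方 `DataTypeDetector`):

```python
import re

# 最小示意:列类型检测的投票逻辑 —— 取前 5 个非空值,
# 超过 60% 命中即判定为对应类型(优先级:日期 > 数值 > 文本)
def detect_type(values, is_datetime, is_numeric):
    sample = [v for v in values if v and v.strip() and v not in ('/', '-')][:5]
    if not sample:
        return 'text'
    if sum(map(is_datetime, sample)) >= len(sample) * 0.6:
        return 'datetime'
    if sum(map(is_numeric, sample)) >= len(sample) * 0.6:
        return 'numeric'
    return 'text'

is_dt = lambda v: bool(re.match(r'^\d{4}[-/]\d{1,2}[-/]\d{1,2}$', v.strip()))
is_num = lambda v: bool(re.match(r'^-?[\d,]+(\.\d+)?$', v.strip()))

print(detect_type(['2023-12-31', '2023-01-01', '/'], is_dt, is_num))  # → datetime
print(detect_type(['28,239,305.48', '1.00', '-'], is_dt, is_num))     # → numeric
```

只采样前 5 个值是速度与准确率的折中:OCR 表格列通常类型一致,少量采样已足够,同时把 `/`、`-` 这类占位符排除在投票之外。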
+ 352 - 0
ocr_comparator/ocr_comparator.py

@@ -0,0 +1,352 @@
+import os
+import sys
+from typing import Dict, List, Tuple
+from datetime import datetime
+from pathlib import Path
+
+# 添加 ocr_platform 根目录到 Python 路径
+# 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
+_file_path = Path(__file__).resolve()
+ocr_platform_root = _file_path.parents[1]  # ocr_comparator.py -> ocr_comparator -> ocr_platform
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+try:
+    from .content_extractor import ContentExtractor
+    from .table_comparator import TableComparator
+    from .paragraph_comparator import ParagraphComparator
+except ImportError:
+    from content_extractor import ContentExtractor
+    from table_comparator import TableComparator
+    from paragraph_comparator import ParagraphComparator
+
+
+class OCRResultComparator:
+    """OCR结果比较器主类"""
+    
+    def __init__(self):
+        self.content_extractor = ContentExtractor()
+        self.table_comparator = TableComparator()
+        self.paragraph_comparator = ParagraphComparator()
+        
+        self.differences = []
+        self.paragraph_match_threshold = 80
+        self.content_similarity_threshold = 95
+        self.max_paragraph_window = 6
+        self.table_comparison_mode = 'standard'
+        self.header_similarity_threshold = 90
+    
+    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
+        """比较两个OCR结果文件"""
+        print(f"\n📖 读取文件...")
+        
+        # 读取文件内容
+        with open(file1_path, 'r', encoding='utf-8') as f:
+            content1 = f.read()
+        
+        with open(file2_path, 'r', encoding='utf-8') as f:
+            content2 = f.read()
+        
+        print(f"✅ 文件读取完成")
+        print(f"   文件1大小: {len(content1)} 字符")
+        print(f"   文件2大小: {len(content2)} 字符")
+        
+        # ✅ 提取结构化内容(包含位置信息)
+        print(f"\n📊 提取结构化内容...")
+        structured_content1 = self.content_extractor.extract_structured_content(content1)
+        structured_content2 = self.content_extractor.extract_structured_content(content2)
+        
+        print(f"   文件1: {len(structured_content1['tables'])}个表格, {len(structured_content1['paragraph_blocks'])}个段落块")
+        print(f"   文件2: {len(structured_content2['tables'])}个表格, {len(structured_content2['paragraph_blocks'])}个段落块")
+        
+        # 初始化差异列表
+        all_differences = []
+        
+        # ✅ 智能表格匹配与比较
+        print(f"\n🔍 开始表格智能匹配...")
+        
+        tables1 = structured_content1['tables']
+        tables2 = structured_content2['tables']
+        
+        # 记录匹配的表格对
+        table_matches = []
+        
+        if tables1 and tables2:
+            # 找到匹配的表格对
+            table_matches = self.table_comparator.find_matching_tables(
+                [t['data'] for t in tables1],
+                [t['data'] for t in tables2]
+            )
+            
+            if not table_matches:
+                print(f"   ⚠️  未找到匹配的表格")
+                all_differences.append({
+                    'type': 'table_structure',
+                    'position': '表格匹配',
+                    'file1_value': f'{len(tables1)}个表格',
+                    'file2_value': f'{len(tables2)}个表格',
+                    'description': '未找到可匹配的表格',
+                    'severity': 'high'
+                })
+            else:
+                # 比较每对匹配的表格
+                for idx1, idx2, similarity in table_matches:
+                    print(f"\n   📋 对比匹配的表格: 表格{idx1+1} vs 表格{idx2+1}")
+                    
+                    if self.table_comparison_mode == 'flow_list':
+                        table_diffs = self.table_comparator.compare_table_flow_list(
+                            tables1[idx1]['data'], tables2[idx2]['data']
+                        )
+                    else:
+                        table_diffs = self.table_comparator.compare_tables(
+                            tables1[idx1]['data'], tables2[idx2]['data']
+                        )
+                    
+                    # 为每个差异添加表格标识
+                    for diff in table_diffs:
+                        diff['table_pair'] = f'表格{idx1+1}↔表格{idx2+1}'
+                        diff['table_similarity'] = similarity
+                    
+                    all_differences.extend(table_diffs)
+                    print(f"      发现 {len(table_diffs)} 个差异")
+                
+                # 检查未匹配的表格
+                matched_tables1 = {m[0] for m in table_matches}
+                matched_tables2 = {m[1] for m in table_matches}
+                
+                for i in range(len(tables1)):
+                    if i not in matched_tables1:
+                        all_differences.append({
+                            'type': 'table_unmatched',
+                            'position': f'文件1表格{i+1}',
+                            'file1_value': f'表格{i+1} (无匹配)',
+                            'file2_value': '',
+                            'description': f'文件1的表格{i+1}在文件2中无匹配表格',
+                            'severity': 'medium'
+                        })
+                
+                for j in range(len(tables2)):
+                    if j not in matched_tables2:
+                        all_differences.append({
+                            'type': 'table_unmatched',
+                            'position': f'文件2表格{j+1}',
+                            'file1_value': '',
+                            'file2_value': f'表格{j+1} (无匹配)',
+                            'description': f'文件2的表格{j+1}在文件1中无匹配表格',
+                            'severity': 'medium'
+                        })
+        
+        elif tables1 and not tables2:
+            all_differences.append({
+                'type': 'table_structure',
+                'position': '表格结构',
+                'file1_value': f'包含{len(tables1)}个表格',
+                'file2_value': '无表格',
+                'description': '文件1包含表格但文件2无表格',
+                'severity': 'high'
+            })
+        elif not tables1 and tables2:
+            all_differences.append({
+                'type': 'table_structure',
+                'position': '表格结构',
+                'file1_value': '无表格',
+                'file2_value': f'包含{len(tables2)}个表格',
+                'description': '文件2包含表格但文件1无表格',
+                'severity': 'high'
+            })
+        
+        # ✅ 根据表格匹配结果对齐段落块
+        print(f"\n🔍 开始段落对比(基于表格位置对齐)...")
+        
+        paragraph_blocks1 = structured_content1['paragraph_blocks']
+        paragraph_blocks2 = structured_content2['paragraph_blocks']
+        
+        # ✅ 构建段落块对应关系
+        aligned_blocks = self._align_paragraph_blocks(
+            paragraph_blocks1, paragraph_blocks2, 
+            tables1, tables2, table_matches
+        )
+        
+        for block_pair in aligned_blocks:
+            block1 = block_pair['block1']
+            block2 = block_pair['block2']
+            position_desc = block_pair['position']
+            
+            paragraphs1 = block1['paragraphs'] if block1 else []
+            paragraphs2 = block2['paragraphs'] if block2 else []
+            
+            if not paragraphs1 and not paragraphs2:
+                continue
+            
+            print(f"   📦 {position_desc}: 文件1有{len(paragraphs1)}个段落, 文件2有{len(paragraphs2)}个段落")
+            
+            # 每个段落块独立对比,指针重新初始化
+            block_diffs = self.paragraph_comparator.compare_paragraphs(
+                paragraphs1, paragraphs2
+            )
+            
+            # 为每个差异添加段落块标识
+            for diff in block_diffs:
+                diff['paragraph_block'] = position_desc
+            
+            all_differences.extend(block_diffs)
+        
+        total_paragraph_diffs = len([d for d in all_differences if d['type'].startswith('paragraph')])
+        print(f"✅ 段落对比完成,共发现 {total_paragraph_diffs} 个差异")
+        
+        print(f"\n✅ 对比完成")
+        
+        # 统计差异
+        stats = {
+            'total_differences': len(all_differences),
+            'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
+            'paragraph_differences': len([d for d in all_differences if d['type'].startswith('paragraph')]),
+            'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
+            'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
+            'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
+            'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
+            'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']),
+            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
+            'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
+            'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
+            'high_severity': len([d for d in all_differences if d.get('severity') in ['critical', 'high']]),
+            'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
+            'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
+        }
+        
+        # ✅ 构建返回结果
+        result = {
+            'differences': all_differences,
+            'statistics': stats,
+            'file1_tables': len(tables1),
+            'file2_tables': len(tables2),
+            'file1_paragraphs': sum(len(b['paragraphs']) for b in paragraph_blocks1),
+            'file2_paragraphs': sum(len(b['paragraphs']) for b in paragraph_blocks2),
+            'file1_path': file1_path,
+            'file2_path': file2_path,
+            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        }
+        
+        print(f"\n" + "="*60)
+        print(f"📊 对比结果汇总")
+        print(f"="*60)
+        print(f"总差异数: {result['statistics']['total_differences']}")
+        print(f"  - 段落差异: {result['statistics']['paragraph_differences']}")
+        print(f"  - 表格差异: {result['statistics']['table_differences']}")
+        print(f"    - 金额: {result['statistics']['amount_differences']}")
+        print(f"    - 日期: {result['statistics']['datetime_differences']}")
+        print(f"    - 文本: {result['statistics']['text_differences']}")
+        print(f"\n严重级别分布:")
+        print(f"  🔴 高: {result['statistics']['high_severity']}")
+        print(f"  🟡 中: {result['statistics']['medium_severity']}")
+        print(f"  🟢 低: {result['statistics']['low_severity']}")
+        print(f"="*60)
+        
+        return result
+    
+    def _align_paragraph_blocks(self, blocks1: List[Dict], blocks2: List[Dict],
+                               tables1: List[Dict], tables2: List[Dict],
+                               table_matches: List[Tuple[int, int, float]]) -> List[Dict]:
+        """
+        根据表格位置对齐段落块
+        
+        Returns:
+            [
+                {'block1': dict, 'block2': dict, 'position': str},
+                ...
+            ]
+        """
+        aligned = []
+        
+        # 如果没有表格,直接对比所有段落块
+        if not tables1 and not tables2:
+            max_blocks = max(len(blocks1), len(blocks2))
+            for i in range(max_blocks):
+                aligned.append({
+                    'block1': blocks1[i] if i < len(blocks1) else None,
+                    'block2': blocks2[i] if i < len(blocks2) else None,
+                    'position': f'段落块{i+1}'
+                })
+            return aligned
+        
+        # 构建表格索引映射
+        table_map = {idx1: idx2 for idx1, idx2, _ in table_matches}
+        
+        # ✅ 策略:根据表格位置划分段落块
+        # 1. 第一个表格前的段落块
+        # 2. 每对匹配表格之间的段落块
+        # 3. 最后一个表格后的段落块
+        
+        # 第一个表格前的段落块
+        if blocks1 or blocks2:
+            first_table_idx1 = min(table_map.keys()) if table_map else len(blocks1)
+            first_table_idx2 = min(table_map.values()) if table_map else len(blocks2)
+            
+            # 找到第一个表格前的所有段落块
+            pre_blocks1 = [b for b in blocks1 if b['end_pos'] <= (tables1[first_table_idx1]['start_pos'] if first_table_idx1 < len(tables1) else float('inf'))]
+            pre_blocks2 = [b for b in blocks2 if b['end_pos'] <= (tables2[first_table_idx2]['start_pos'] if first_table_idx2 < len(tables2) else float('inf'))]
+            
+            if pre_blocks1 or pre_blocks2:
+                # 合并所有表格前的段落
+                merged_block1 = self._merge_paragraph_blocks(pre_blocks1) if pre_blocks1 else None
+                merged_block2 = self._merge_paragraph_blocks(pre_blocks2) if pre_blocks2 else None
+                
+                aligned.append({
+                    'block1': merged_block1,
+                    'block2': merged_block2,
+                    'position': '文档开头(表格前)'
+                })
+        
+        # 每对匹配表格之间的段落块
+        sorted_matches = sorted(table_matches, key=lambda x: x[0])
+        
+        for i, (idx1, idx2, _) in enumerate(sorted_matches):
+            # 当前表格后、下一个表格前的段落块
+            table1_end = tables1[idx1]['end_pos']
+            table2_end = tables2[idx2]['end_pos']
+            
+            # 下一个表格的开始位置
+            if i + 1 < len(sorted_matches):
+                next_idx1 = sorted_matches[i + 1][0]
+                next_idx2 = sorted_matches[i + 1][1]
+                next_table1_start = tables1[next_idx1]['start_pos']
+                next_table2_start = tables2[next_idx2]['start_pos']
+            else:
+                next_table1_start = float('inf')
+                next_table2_start = float('inf')
+            
+            # 找到这个范围内的段落块
+            between_blocks1 = [b for b in blocks1 
+                             if b['start_pos'] >= table1_end and b['end_pos'] <= next_table1_start]
+            between_blocks2 = [b for b in blocks2 
+                             if b['start_pos'] >= table2_end and b['end_pos'] <= next_table2_start]
+            
+            if between_blocks1 or between_blocks2:
+                merged_block1 = self._merge_paragraph_blocks(between_blocks1) if between_blocks1 else None
+                merged_block2 = self._merge_paragraph_blocks(between_blocks2) if between_blocks2 else None
+                
+                aligned.append({
+                    'block1': merged_block1,
+                    'block2': merged_block2,
+                    'position': f'表格{idx1+1}↔表格{idx2+1} 之后'
+                })
+        
+        return aligned
+    
+    def _merge_paragraph_blocks(self, blocks: List[Dict]) -> Dict:
+        """合并多个段落块为一个"""
+        if not blocks:
+            return None
+        
+        if len(blocks) == 1:
+            return blocks[0]
+        
+        all_paragraphs = []
+        for block in blocks:
+            all_paragraphs.extend(block['paragraphs'])
+        
+        return {
+            'start_pos': blocks[0]['start_pos'],
+            'end_pos': blocks[-1]['end_pos'],
+            'paragraphs': all_paragraphs
+        }

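`_align_paragraph_blocks` 的核心思路是:用表格的 `start_pos`/`end_pos` 把段落块切成"表格前"和"各匹配表格之后"的若干区间,再逐区间独立对比。下面用位置区间元组给出一个脱离本类的最小示意(字段和分组名均为演示用简化):

```python
# 最小示意:按表格位置区间把段落块分组
# blocks / table_spans 均为 (start, end) 元组,table_spans 已按位置排序
def partition_blocks(blocks, table_spans):
    if not table_spans:
        return {'全部': blocks}
    # 第一个表格之前的段落块
    groups = {'表格前': [b for b in blocks if b[1] <= table_spans[0][0]]}
    # 每个表格之后、下一个表格之前的段落块
    for i, (t_start, t_end) in enumerate(table_spans):
        nxt = table_spans[i + 1][0] if i + 1 < len(table_spans) else float('inf')
        groups[f'表格{i+1}之后'] = [b for b in blocks
                                    if b[0] >= t_end and b[1] <= nxt]
    return groups

blocks = [(0, 10), (50, 60), (120, 130)]
tables = [(20, 40), (80, 100)]
print(partition_blocks(blocks, tables))
# → {'表格前': [(0, 10)], '表格1之后': [(50, 60)], '表格2之后': [(120, 130)]}
```

这样即使两个文件的段落数量不同,只要表格匹配成功,段落对比就能在对应区间内进行,避免跨区间的错误配对。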
+ 183 - 0
ocr_comparator/paragraph_comparator.py

@@ -0,0 +1,183 @@
+from typing import Dict, List
+# ✅ 兼容相对导入和绝对导入
+try:
+    from .text_processor import TextProcessor
+    from .similarity_calculator import SimilarityCalculator
+except ImportError:
+    from text_processor import TextProcessor
+    from similarity_calculator import SimilarityCalculator
+
+class ParagraphComparator:
+    """段落比较"""
+    
+    def __init__(self):
+        self.text_processor = TextProcessor()
+        self.calculator = SimilarityCalculator()
+        self.paragraph_match_threshold = 80
+        self.content_similarity_threshold = 95
+        self.max_paragraph_window = 6
+    
+    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
+        """改进的段落匹配算法"""
+        differences = []
+        
+        # 预处理
+        normalized_paras1 = [self.text_processor.normalize_text_for_comparison(p) for p in paras1]
+        normalized_paras2 = [self.text_processor.normalize_text_for_comparison(p) for p in paras2]
+        
+        original_paras1 = [self.text_processor.strip_markdown_formatting(p) for p in paras1]
+        original_paras2 = [self.text_processor.strip_markdown_formatting(p) for p in paras2]
+        
+        used_paras1 = set()
+        used_paras2 = set()
+        
+        start_index2 = 0
+        last_match_index2 = 0
+        
+        for window_size1 in range(1, min(self.max_paragraph_window, len(normalized_paras1)) + 1):
+            for i in range(len(normalized_paras1) - window_size1 + 1):
+                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
+                    continue
+                
+                combined_normalized1 = "".join(normalized_paras1[i:i+window_size1])
+                combined_original1 = "".join(original_paras1[i:i+window_size1])
+                
+                best_match = self._find_best_match(
+                    combined_normalized1, 
+                    normalized_paras2,
+                    start_index2,
+                    last_match_index2,
+                    used_paras2
+                )
+                
+                if best_match and best_match['similarity'] >= self.paragraph_match_threshold:
+                    matched_indices = best_match['indices']
+                    last_match_index2 = matched_indices[-1]
+                    start_index2 = last_match_index2 + 1
+                    
+                    for idx in range(i, i + window_size1):
+                        used_paras1.add(idx)
+                    for idx in matched_indices:
+                        used_paras2.add(idx)
+                    
+                    combined_original2 = "".join([original_paras2[idx] for idx in matched_indices])
+                    
+                    # 检查标点差异
+                    punctuation_diffs = self.calculator.check_punctuation_differences(
+                        combined_original1, 
+                        combined_original2,
+                        self.text_processor.normalize_punctuation
+                    )
+                    
+                    if punctuation_diffs:
+                        diff_description = []
+                        for pdiff in punctuation_diffs:
+                            diff_description.append(
+                                f"位置{pdiff['position']}: '{pdiff['char1']}' vs '{pdiff['char2']}'"
+                            )
+                        
+                        differences.append({
+                            'type': 'paragraph_punctuation',
+                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
+                            'file1_value': combined_original1,
+                            'file2_value': combined_original2,
+                            'description': f'段落全角半角标点差异: {"; ".join(diff_description)}',
+                            'punctuation_differences': punctuation_diffs,
+                            'similarity': 100.0,
+                            'severity': 'low'
+                        })
+                    
+                    elif best_match['similarity'] < self.content_similarity_threshold:
+                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
+                        differences.append({
+                            'type': 'paragraph',
+                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
+                            'file1_value': combined_original1,
+                            'file2_value': combined_original2,
+                            'description': f'段落内容差异 (相似度: {best_match["similarity"]:.1f}%)',
+                            'similarity': best_match['similarity'],
+                            'severity': severity
+                        })
+        
+        # 处理未匹配的段落
+        for i, para in enumerate(original_paras1):
+            if i not in used_paras1:
+                differences.append({
+                    'type': 'paragraph',
+                    'position': f'段落{i+1}',
+                    'file1_value': para,
+                    'file2_value': "",
+                    'description': '文件1中独有的段落',
+                    'similarity': 0.0,
+                    'severity': 'medium'
+                })
+        
+        for j, para in enumerate(original_paras2):
+            if j not in used_paras2:
+                differences.append({
+                    'type': 'paragraph',
+                    'position': f'段落{j+1}',
+                    'file1_value': "",
+                    'file2_value': para,
+                    'description': '文件2中独有的段落',
+                    'similarity': 0.0,
+                    'severity': 'medium'
+                })
+        
+        return differences
+    
+    def _find_best_match(self, target_text: str, paras2: List[str], 
+                        start_index: int, last_match_index: int,
+                        used_paras2: set) -> Dict:
+        """改进的段落匹配方法"""
+        search_start = last_match_index - 1
+        unused_count = 0
+        
+        while search_start >= 0:
+            if search_start not in used_paras2:
+                unused_count += 1
+            if unused_count >= self.max_paragraph_window:
+                break
+            search_start -= 1
+        
+        if search_start < 0:
+            search_start = 0
+            while search_start < start_index and search_start in used_paras2:
+                search_start += 1
+        
+        search_end = min(start_index + self.max_paragraph_window, len(paras2))
+        best_match = None
+        
+        for window_size in range(1, self.max_paragraph_window + 1):
+            for j in range(search_start, search_end):
+                if any(idx in used_paras2 for idx in range(j, min(j + window_size, len(paras2)))):
+                    continue
+                
+                if j + window_size > len(paras2):
+                    break
+                
+                combined_para2 = "".join(paras2[j:j+window_size])
+                
+                if target_text == combined_para2:
+                    similarity = 100.0
+                else:
+                    similarity = self.calculator.calculate_text_similarity(target_text, combined_para2)
+                
+                if not best_match or similarity > best_match['similarity']:
+                    best_match = {
+                        'text': combined_para2,
+                        'similarity': similarity,
+                        'indices': list(range(j, j + window_size))
+                    }
+                    
+                    if similarity == 100.0:
+                        return best_match
+        
+        if best_match is None:
+            return {
+                'text': '',
+                'similarity': 0.0,
+                'indices': []
+            }
+        
+        return best_match

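上面的 `_find_best_match` 在候选段落附近做滑动窗口匹配:把连续若干段合并后与目标文本比相似度,取最高者。下面用标准库 `difflib` 代替 `SimilarityCalculator`,给出同一思路的最小示意(仅为演示,阈值与搜索范围控制从略):

```python
from difflib import SequenceMatcher

# 最小示意:窗口合并匹配 —— 把连续 size 个候选段拼接后与目标比相似度,
# 返回相似度最高的组合;完全匹配(100%)时提前返回
def best_window_match(target, paras, max_window=3):
    best = {'similarity': 0.0, 'indices': []}
    for size in range(1, max_window + 1):
        for j in range(len(paras) - size + 1):
            cand = "".join(paras[j:j + size])
            sim = SequenceMatcher(None, target, cand).ratio() * 100
            if sim > best['similarity']:
                best = {'similarity': sim, 'indices': list(range(j, j + size))}
            if sim == 100.0:
                return best
    return best

paras = ["货币资金", "包括库存现金", "和银行存款"]
m = best_window_match("货币资金包括库存现金", paras)
print(m['indices'])  # → [0, 1]
```

窗口合并解决的是 OCR 常见的"一段被识别成多段"问题:单段逐一对比时相似度都不高,但合并后能达到 100% 匹配。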
+ 124 - 0
ocr_comparator/report_generator.py

@@ -0,0 +1,124 @@
+import json
+import re
+from typing import Dict, List
+from datetime import datetime
+
+
+class ReportGenerator:
+    """生成比较报告"""
+    
+    @staticmethod
+    def generate_json_report(comparison_result: Dict, output_file: str):
+        """生成JSON格式报告"""
+        with open(f"{output_file}.json", 'w', encoding='utf-8') as f:
+            json.dump(comparison_result, f, ensure_ascii=False, indent=2)
+        print(f"✅ JSON报告已生成: {output_file}.json")
+    
+    @staticmethod
+    def generate_markdown_report(comparison_result: Dict, output_file: str):
+        """生成Markdown格式报告"""
+        with open(f"{output_file}.md", 'w', encoding='utf-8') as f:
+            f.write("# OCR结果对比报告\n\n")
+            
+            # 基本信息
+            f.write("## 基本信息\n\n")
+            f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
+            f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
+            f.write(f"- **比较时间**: {comparison_result.get('timestamp', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))}\n\n")
+            
+            # 统计信息
+            stats = comparison_result['statistics']
+            f.write("## 统计信息\n\n")
+            f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
+            f.write(f"- 表格差异: **{stats['table_differences']}**\n")
+            f.write(f"- 其中表格金额差异: **{stats.get('amount_differences', 0)}**\n")
+            f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
+            f.write(f"- 高严重度: **{stats.get('high_severity', 0)}**\n")
+            f.write(f"- 中严重度: **{stats.get('medium_severity', 0)}**\n")
+            f.write(f"- 低严重度: **{stats.get('low_severity', 0)}**\n")
+            f.write(f"- 文件1表格数: {comparison_result.get('file1_tables', 0)}\n")
+            f.write(f"- 文件2表格数: {comparison_result.get('file2_tables', 0)}\n")
+            f.write(f"- 文件1段落数: {comparison_result.get('file1_paragraphs', 0)}\n")
+            f.write(f"- 文件2段落数: {comparison_result.get('file2_paragraphs', 0)}\n\n")
+            
+            # 差异摘要
+            if stats['total_differences'] == 0:
+                f.write("## 结论\n\n")
+                f.write("🎉 **完美匹配!没有发现任何差异。**\n\n")
+            else:
+                f.write("## 差异摘要\n\n")
+                
+                # ✅ 类型映射(与原版本完全一致)
+                type_name_map = {
+                    'table_amount': '💰 表格金额差异',
+                    'table_text': '📝 表格文本差异',
+                    'table_datetime': '📅 表格日期时间差异',
+                    'table_pre_header': '📋 表头前内容差异',
+                    'table_header_position': '📍 表头位置差异',
+                    'table_header_mismatch': '⚠️ 表头不匹配',
+                    'table_header_critical': '❌ 表头严重错误',
+                    'table_column_type_mismatch': '🔀 列类型不匹配',
+                    'table_row_missing': '🚫 表格行缺失',
+                    'table_row_data': '📊 表格数据差异',
+                    'table_structure': '🏗️ 表格结构差异',
+                    'paragraph': '📄 段落差异',
+                    'paragraph_punctuation': '🔤 段落标点差异'
+                }
+                
+                # 按类型分组显示差异
+                diff_by_type = {}
+                for diff in comparison_result['differences']:
+                    diff_type = diff['type']
+                    if diff_type not in diff_by_type:
+                        diff_by_type[diff_type] = []
+                    diff_by_type[diff_type].append(diff)
+                
+                for diff_type, diffs in diff_by_type.items():
+                    type_name = type_name_map.get(diff_type, f'❓ {diff_type}')
+                    
+                    f.write(f"### {type_name} ({len(diffs)}个)\n\n")
+                    
+                    for i, diff in enumerate(diffs, 1):
+                        f.write(f"**{i}. {diff.get('position', 'N/A')}**\n")
+                        f.write(f"- 文件1: `{diff.get('file1_value', '')}`\n")
+                        f.write(f"- 文件2: `{diff.get('file2_value', '')}`\n")
+                        f.write(f"- 说明: {diff.get('description', 'N/A')}\n")
+                        if 'severity' in diff:
+                            severity_icon = {'critical': '🔴', 'high': '🟠', 'medium': '🟡', 'low': '🟢'}
+                            f.write(f"- 严重度: {severity_icon.get(diff['severity'], '⚪')} {diff['severity']}\n")
+                        f.write("\n")
+            
+            # 详细差异列表
+            if comparison_result['differences']:
+                f.write("## 详细差异列表\n\n")
+                f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 | 严重度 |\n")
+                f.write("| --- | --- | --- | --- | --- | --- | --- |\n")
+                
+                for i, diff in enumerate(comparison_result['differences'], 1):
+                    severity = diff.get('severity', 'N/A')
+                    position = diff.get('position', 'N/A')
+                    file1_value = str(diff.get('file1_value', ''))[:50]
+                    file2_value = str(diff.get('file2_value', ''))[:50]
+                    description = diff.get('description', 'N/A')
+                    
+                    # 截断长文本
+                    if len(str(diff.get('file1_value', ''))) > 50:
+                        file1_value += '...'
+                    if len(str(diff.get('file2_value', ''))) > 50:
+                        file2_value += '...'
+                    
+                    f.write(f"| {i} | {diff['type']} | {position} | ")
+                    f.write(f"`{file1_value}` | ")
+                    f.write(f"`{file2_value}` | ")
+                    f.write(f"{description} | {severity} |\n")
+        
+        print(f"✅ Markdown报告已生成: {output_file}.md")
+    
+    @staticmethod
+    def generate_report(comparison_result: Dict, output_file: str, output_format: str):
+        """根据格式生成报告"""
+        if output_format in ['json', 'both']:
+            ReportGenerator.generate_json_report(comparison_result, output_file)
+        
+        if output_format in ['markdown', 'both']:
+            ReportGenerator.generate_markdown_report(comparison_result, output_file)

+ 53 - 0
ocr_comparator/similarity_calculator.py

@@ -0,0 +1,53 @@
+from fuzzywuzzy import fuzz
+from typing import Dict, List
+
+
+class SimilarityCalculator:
+    """文本相似度计算"""
+    
+    @staticmethod
+    def calculate_text_similarity(text1: str, text2: str) -> float:
+        """改进的相似度计算"""
+        if not text1 and not text2:
+            return 100.0
+        if not text1 or not text2:
+            return 0.0
+        
+        if text1 == text2:
+            return 100.0
+        
+        similarity_scores = [fuzz.ratio(text1, text2)]
+        return max(similarity_scores)
+    
+    @staticmethod
+    def check_punctuation_differences(text1: str, text2: str, normalize_func) -> List[Dict]:
+        """检查两段文本的标点符号差异"""
+        differences = []
+        
+        normalized1 = normalize_func(text1)
+        normalized2 = normalize_func(text2)
+        
+        if normalized1 == normalized2 and text1 != text2:
+            min_len = min(len(text1), len(text2))
+            
+            for i in range(min_len):
+                if text1[i] != text2[i]:
+                    char1 = text1[i]
+                    char2 = text2[i]
+                    
+                    if normalize_func(char1) == normalize_func(char2):
+                        start = max(0, i - 3)
+                        # 分别按各自文本的长度截取上下文,避免两段长度不同时越界
+                        context1 = text1[start:min(len(text1), i + 4)]
+                        context2 = text2[start:min(len(text2), i + 4)]
+                        
+                        differences.append({
+                            'position': i,
+                            'char1': char1,
+                            'char2': char2,
+                            'context1': context1,
+                            'context2': context2,
+                            'type': 'full_half_width'
+                        })
+        
+        return differences

+ 849 - 0
ocr_comparator/table_comparator.py

@@ -0,0 +1,849 @@
+import re
+import sys
+from typing import Dict, List, Tuple, Optional
+from pathlib import Path
+
+# 添加 ocr_platform 根目录到 Python 路径
+# 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
+_file_path = Path(__file__).resolve()
+ocr_platform_root = _file_path.parents[1]  # table_comparator.py -> ocr_comparator -> ocr_platform
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+try:
+    from .data_type_detector import DataTypeDetector
+    from .similarity_calculator import SimilarityCalculator
+    from .text_processor import TextProcessor
+except ImportError:
+    from data_type_detector import DataTypeDetector
+    from similarity_calculator import SimilarityCalculator
+    from text_processor import TextProcessor
+
+
+class TableComparator:
+    """表格数据比较"""
+    
+    def __init__(self):
+        self.detector = DataTypeDetector()
+        self.calculator = SimilarityCalculator()
+        self.text_processor = TextProcessor()
+        self.header_similarity_threshold = 90
+        self.content_similarity_threshold = 95
+        self.max_paragraph_window = 6
+    
+    def find_matching_tables(self, tables1: List[List[List[str]]], 
+                            tables2: List[List[List[str]]]) -> List[Tuple[int, int, float]]:
+        """
+        智能匹配两个文件中的表格
+        
+        Returns:
+            List[Tuple[int, int, float]]: (table1_index, table2_index, similarity_score)
+        """
+        matches = []
+        
+        for i, table1 in enumerate(tables1):
+            if not table1:
+                continue
+            
+            best_match = None
+            best_score = 0
+            
+            for j, table2 in enumerate(tables2):
+                if not table2:
+                    continue
+                
+                # 计算表格相似度
+                score = self._calculate_table_similarity(table1, table2)
+                
+                if score > best_score:
+                    best_score = score
+                    best_match = j
+            
+            if best_match is not None and best_score > 50:  # 至少50%相似度
+                matches.append((i, best_match, best_score))
+                print(f"   📊 表格匹配: 文件1表格{i+1} ↔ 文件2表格{best_match+1} (相似度: {best_score:.1f}%)")
+        
+        return matches
+    
+    def _get_max_columns(self, table: List[List[str]]) -> int:
+        """获取表格的最大列数"""
+        if not table:
+            return 0
+        return max(len(row) for row in table)
+    
+    def _calculate_table_similarity(self, table1: List[List[str]], 
+                                   table2: List[List[str]]) -> float:
+        """计算两个表格的相似度"""
+        if not table1 or not table2:
+            return 0.0
+        
+        # 1. 行数相似度 (权重: 15%)
+        row_count1 = len(table1)
+        row_count2 = len(table2)
+        row_similarity = 100 * (1 - abs(row_count1 - row_count2) / max(row_count1, row_count2))
+        
+        # 2. 列数相似度 (权重: 15%) - ✅ 使用最大列数
+        col_count1 = self._get_max_columns(table1)
+        col_count2 = self._get_max_columns(table2)
+        
+        max_cols = max(col_count1, col_count2)
+        min_cols = min(col_count1, col_count2)
+        
+        if max_cols == 0:
+            col_similarity = 0
+        else:
+            # 如果列数差异在合理范围内(比如差1-2列),给予较高分数
+            col_diff = abs(col_count1 - col_count2)
+            if col_diff == 0:
+                col_similarity = 100
+            elif col_diff <= 2:
+                # 差1-2列,给予80-95分
+                col_similarity = 100 - (col_diff * 10)
+            else:
+                # 差异较大时,使用比例计算
+                col_similarity = 100 * (min_cols / max_cols)
+        
+        print(f"      行数对比: {row_count1} vs {row_count2}, 相似度: {row_similarity:.1f}%")
+        print(f"      列数对比: {col_count1} vs {col_count2}, 相似度: {col_similarity:.1f}%")
+        
+        # 3. 表头相似度 (权重: 50%) - ✅ 先检测表头位置
+        header_row_idx1 = self.detect_table_header_row(table1)
+        header_row_idx2 = self.detect_table_header_row(table2)
+        
+        print(f"      表头位置: 文件1第{header_row_idx1+1}行, 文件2第{header_row_idx2+1}行")
+        
+        header_similarity = 0
+        if header_row_idx1 < len(table1) and header_row_idx2 < len(table2):
+            header1 = table1[header_row_idx1]
+            header2 = table2[header_row_idx2]
+            
+            if header1 and header2:
+                # ✅ 智能表头匹配
+                header_similarity = self._calculate_header_similarity_smart(header1, header2)
+        
+        print(f"      表头相似度: {header_similarity:.1f}%")
+        
+        # 4. 内容特征相似度 (权重: 20%)
+        content_similarity = self._calculate_content_features_similarity(table1, table2)
+        
+        print(f"      内容特征相似度: {content_similarity:.1f}%")
+        
+        # ✅ 调整权重分配
+        total_similarity = (
+            row_similarity * 0.15 +      # 行数 15%
+            col_similarity * 0.15 +      # 列数 15%  
+            header_similarity * 0.50 +   # 表头 50% (最重要)
+            content_similarity * 0.20    # 内容 20%
+        )
+        
+        return total_similarity
+    
+    def _calculate_header_similarity_smart(self, header1: List[str], 
+                                          header2: List[str]) -> float:
+        """
+        智能计算表头相似度
+        
+        处理以下情况:
+        1. 列数不同但表头内容相似
+        2. PaddleOCR可能将多行表头合并
+        3. 表头顺序可能不同
+        """
+        if not header1 or not header2:
+            return 0.0
+        
+        # 标准化表头
+        norm_headers1 = [self.normalize_header_text(h) for h in header1]
+        norm_headers2 = [self.normalize_header_text(h) for h in header2]
+        
+        # 方法1: 精确匹配 (最高优先级)
+        common_headers = set(norm_headers1) & set(norm_headers2)
+        max_len = max(len(norm_headers1), len(norm_headers2))
+        
+        if max_len == 0:
+            return 0.0
+        
+        exact_match_ratio = len(common_headers) / max_len
+        
+        # 方法2: 模糊匹配 (针对列数不同的情况)
+        fuzzy_matches = 0
+        
+        # 使用较短的表头作为基准
+        if len(norm_headers1) <= len(norm_headers2):
+            base_headers = norm_headers1
+            compare_headers = norm_headers2
+        else:
+            base_headers = norm_headers2
+            compare_headers = norm_headers1
+        
+        for base_h in base_headers:
+            best_similarity = 0
+            for comp_h in compare_headers:
+                similarity = self.calculator.calculate_text_similarity(base_h, comp_h)
+                if similarity > best_similarity:
+                    best_similarity = similarity
+                    if best_similarity == 100:
+                        break
+            
+            # 如果相似度超过70%,认为是匹配的
+            if best_similarity > 70:
+                fuzzy_matches += 1
+        
+        fuzzy_match_ratio = fuzzy_matches / max_len if max_len > 0 else 0
+        
+        # 方法3: 关键字匹配 (识别常见表头)
+        key_headers = {
+            'date': ['日期', 'date', '时间', 'time'],
+            'type': ['类型', 'type', '业务', 'business'],
+            'number': ['号', 'no', '编号', 'id', '票据', 'bill'],
+            'description': ['摘要', 'description', '说明', 'remark'],
+            'amount': ['金额', 'amount', '借方', 'debit', '贷方', 'credit'],
+            'balance': ['余额', 'balance'],
+            'counterparty': ['对手', 'counterparty', '账户', 'account', '户名', 'name']
+        }
+        
+        def categorize_header(h: str) -> set:
+            categories = set()
+            h_lower = h.lower()
+            for category, keywords in key_headers.items():
+                for keyword in keywords:
+                    if keyword in h_lower:
+                        categories.add(category)
+            return categories
+        
+        categories1 = set()
+        for h in norm_headers1:
+            categories1.update(categorize_header(h))
+        
+        categories2 = set()
+        for h in norm_headers2:
+            categories2.update(categorize_header(h))
+        
+        common_categories = categories1 & categories2
+        all_categories = categories1 | categories2
+        
+        category_match_ratio = len(common_categories) / len(all_categories) if all_categories else 0
+        
+        # ✅ 综合三种方法,加权计算
+        final_similarity = (
+            exact_match_ratio * 0.4 +      # 精确匹配 40%
+            fuzzy_match_ratio * 0.4 +      # 模糊匹配 40%
+            category_match_ratio * 0.2     # 语义匹配 20%
+        ) * 100
+        
+        print(f"        精确匹配: {exact_match_ratio:.1%}, 模糊匹配: {fuzzy_match_ratio:.1%}, 语义匹配: {category_match_ratio:.1%}")
+        
+        return final_similarity
+    
+    def _calculate_content_features_similarity(self, table1: List[List[str]], 
+                                              table2: List[List[str]]) -> float:
+        """计算表格内容特征相似度"""
+        # 统计数字、日期等特征
+        features1 = self._extract_table_features(table1)
+        features2 = self._extract_table_features(table2)
+        
+        # 比较特征
+        similarity_scores = []
+        
+        for key in ['numeric_ratio', 'date_ratio', 'empty_ratio']:
+            if key in features1 and key in features2:
+                diff = abs(features1[key] - features2[key])
+                similarity_scores.append(100 * (1 - diff))
+        
+        return sum(similarity_scores) / len(similarity_scores) if similarity_scores else 0
+    
+    def _extract_table_features(self, table: List[List[str]]) -> Dict:
+        """提取表格特征"""
+        total_cells = 0
+        numeric_cells = 0
+        date_cells = 0
+        empty_cells = 0
+        
+        for row in table:
+            for cell in row:
+                total_cells += 1
+                
+                if not cell or cell.strip() == '':
+                    empty_cells += 1
+                    continue
+                
+                if self.detector.is_numeric(cell):
+                    numeric_cells += 1
+                
+                if self.detector.extract_datetime(cell):
+                    date_cells += 1
+        
+        return {
+            'numeric_ratio': numeric_cells / total_cells if total_cells > 0 else 0,
+            'date_ratio': date_cells / total_cells if total_cells > 0 else 0,
+            'empty_ratio': empty_cells / total_cells if total_cells > 0 else 0,
+            'total_cells': total_cells
+        }
+    
+    def normalize_header_text(self, text: str) -> str:
+        """标准化表头文本"""
+        # 移除括号内容
+        text = re.sub(r'[((].*?[))]', '', text)
+        # 移除空格
+        text = re.sub(r'\s+', '', text)
+        # 只保留字母、数字和中文
+        text = re.sub(r'[^\w\u4e00-\u9fff]', '', text)
+        return text.lower().strip()
+    
+    def detect_table_header_row(self, table: List[List[str]]) -> int:
+        """
+        智能检测表格的表头行索引
+        
+        检测策略:
+        1. 查找包含表头关键字最多的行
+        2. 确认下一行是数据行(或分类行)
+        3. 特殊处理:资产负债表等多层表头
+        """
+        if not table:
+            return 0
+        
+        header_keywords = [
+            '日期', 'date', '时间', 'time',
+            '类型', 'type', '业务', 'business',
+            '号', 'no', '编号', 'id', '票据', 'bill',
+            '摘要', 'description', '说明', 'remark',
+            '金额', 'amount', '借方', 'debit', '贷方', 'credit',
+            '余额', 'balance',
+            '对手', 'counterparty', '账户', 'account', '户名', 'name',
+            # ✅ 新增:资产负债表关键词
+            # '资产', 'asset', '负债', 'liability', '期末', 'period', '期初'
+            '期末', 'period', '期初'
+        ]
+        
+        best_header_row = 0
+        best_score = 0
+
+        for row_idx, row in enumerate(table[:10]):
+            if not row:
+                continue
+            
+            # 计算关键字匹配分数
+            keyword_count = 0
+            non_empty_cells = 0
+            
+            for cell in row:
+                cell_text = str(cell).strip()
+                if cell_text:
+                    non_empty_cells += 1
+                    cell_lower = cell_text.lower()
+                    
+                    for keyword in header_keywords:
+                        if keyword in cell_lower:
+                            keyword_count += 1
+                            break
+        
+            if non_empty_cells < 3:
+                continue
+            
+            keyword_ratio = keyword_count / non_empty_cells if non_empty_cells > 0 else 0
+            column_bonus = min(non_empty_cells / 5, 1.0)
+            score = keyword_ratio * 0.7 + column_bonus * 0.3
+            
+            # ✅ 改进:跳过分类行和数据行检测
+            if row_idx + 1 < len(table):
+                next_row = table[row_idx + 1]
+                # 如果下一行是数据行,加分
+                if self._is_data_row(next_row):
+                    score += 0.2
+                # ✅ 新增:如果下一行是分类行(如"流动资产:"),小幅加分
+                elif self._is_category_row(next_row):
+                    score += 0.1
+
+            if score > best_score:
+                best_score = score
+                best_header_row = row_idx
+        
+        if best_score < 0.3:
+            print(f"   ⚠️  未检测到明确表头,默认使用第1行 (得分: {best_score:.2f})")
+            return 0
+        
+        print(f"   📍 检测到表头在第 {best_header_row + 1} 行 (得分: {best_score:.2f})")
+        return best_header_row
+
+    def _is_category_row(self, row: List[str]) -> bool:
+        """
+        ✅ 新增:判断是否为分类行(如"流动资产:")
+        """
+        if not row:
+            return False
+        
+        category_patterns = [
+            # r'流动[资产负债]',
+            # r'非流动[资产负债]',
+            r'.*:$',  # 以冒号结尾
+        ]
+        
+        for cell in row:
+            cell_text = str(cell).strip()
+            if not cell_text:
+                continue
+            
+            for pattern in category_patterns:
+                if re.search(pattern, cell_text):
+                    return True
+        
+        return False
+    
+    def _is_data_row(self, row: List[str]) -> bool:
+        """
+        判断是否为数据行(改进版)
+        
+        ✅ "-" 符号表示金额为0或空,应该被认为是有效的数据单元格
+        """
+        if not row:
+            return False
+        
+        data_pattern_count = 0
+        non_empty_count = 0
+        
+        for cell in row:
+            cell_text = str(cell).strip()
+            if not cell_text:
+                continue
+            
+            non_empty_count += 1
+            
+            # ✅ "-" 符号也是有效的数据(表示0或空)
+            if cell_text == '-' or cell_text == '—' or cell_text == '--':
+                data_pattern_count += 1
+                continue
+            
+            # 数字、日期、金额任一特征即视为数据单元格
+            # (只累加一次,避免同一单元格被重复计数、比例超过 1)
+            if (re.search(r'\d', cell_text)
+                    or re.search(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}', cell_text)
+                    or re.search(r'-?\d+[,,]?\d*\.?\d+', cell_text)):
+                data_pattern_count += 1
+        
+        if non_empty_count == 0:
+            return False
+        
+        # 至少30%的单元格包含数据特征
+        return data_pattern_count / non_empty_count >= 0.3
+    
+    def compare_table_headers(self, headers1: List[str], headers2: List[str]) -> Dict:
+        """比较表格表头"""
+        result = {
+            'match': True,
+            'differences': [],
+            'column_mapping': {},
+            'similarity_scores': []
+        }
+        
+        if len(headers1) != len(headers2):
+            result['match'] = False
+            result['differences'].append({
+                'type': 'table_header_critical',
+                'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
+                'severity': 'critical'
+            })
+            return result
+        
+        for i, (h1, h2) in enumerate(zip(headers1, headers2)):
+            norm_h1 = self.normalize_header_text(h1)
+            norm_h2 = self.normalize_header_text(h2)
+            
+            similarity = self.calculator.calculate_text_similarity(norm_h1, norm_h2)
+            result['similarity_scores'].append({
+                'column_index': i,
+                'header1': h1,
+                'header2': h2,
+                'similarity': similarity
+            })
+            
+            if similarity < self.header_similarity_threshold:
+                result['match'] = False
+                result['differences'].append({
+                    'type': 'table_header_mismatch',
+                    'column_index': i,
+                    'header1': h1,
+                    'header2': h2,
+                    'similarity': similarity,
+                    'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
+                    'severity': 'medium' if similarity < 50 else 'high'
+                })
+            else:
+                result['column_mapping'][i] = i
+        
+        return result
+    
+    def compare_cell_value(self, value1: str, value2: str, column_type: str, 
+                          column_name: str = '') -> Dict:
+        """比较单元格值"""
+        result = {
+            'match': True,
+            'difference': None
+        }
+        
+        v1 = self.text_processor.normalize_text(value1)
+        v2 = self.text_processor.normalize_text(value2)
+        
+        if v1 == v2:
+            return result
+        
+        if column_type == 'text_number':
+            norm_v1 = self.detector.normalize_text_number(v1)
+            norm_v2 = self.detector.normalize_text_number(v2)
+            
+            if norm_v1 == norm_v2:
+                result['match'] = False
+                result['difference'] = {
+                    'type': 'table_text',
+                    'value1': value1,
+                    'value2': value2,
+                    'description': f'文本型数字格式差异: "{value1}" vs "{value2}" (内容相同,空格不同)',
+                    'severity': 'low'
+                }
+            else:
+                result['match'] = False
+                result['difference'] = {
+                    'type': 'table_text',
+                    'value1': value1,
+                    'value2': value2,
+                    'description': f'文本型数字不一致: {value1} vs {value2}',
+                    'severity': 'high'
+                }
+            return result
+        
+        if column_type == 'numeric':
+            if self.detector.is_numeric(v1) and self.detector.is_numeric(v2):
+                num1 = self.detector.parse_number(v1)
+                num2 = self.detector.parse_number(v2)
+                if abs(num1 - num2) > 0.01:
+                    result['match'] = False
+                    result['difference'] = {
+                        'type': 'table_amount',
+                        'value1': value1,
+                        'value2': value2,
+                        'diff_amount': abs(num1 - num2),
+                        'description': f'金额不一致: {value1} vs {value2}',
+                        'severity': 'high'  # ✅ 修改:金额差异 = high
+                    }
+            else:
+                result['match'] = False
+                result['difference'] = {
+                    'type': 'table_text',
+                    'value1': value1,
+                    'value2': value2,
+                    'description': f'长数字字符串不一致: {value1} vs {value2}',
+                    'severity': 'medium'  # ✅ 修改:数字字符串差异 = medium
+                }
+        elif column_type == 'datetime':
+            datetime1 = self.detector.extract_datetime(v1)
+            datetime2 = self.detector.extract_datetime(v2)
+            
+            if datetime1 != datetime2:
+                result['match'] = False
+                result['difference'] = {
+                    'type': 'table_datetime',
+                    'value1': value1,
+                    'value2': value2,
+                    'description': f'日期时间不一致: {value1} vs {value2}',
+                    'severity': 'medium'  # 日期差异 = medium
+                }
+        else:
+            similarity = self.calculator.calculate_text_similarity(v1, v2)
+            if similarity < self.content_similarity_threshold:
+                result['match'] = False
+                result['difference'] = {
+                    'type': 'table_text',
+                    'value1': value1,
+                    'value2': value2,
+                    'similarity': similarity,
+                    'description': f'文本不一致: {value1} vs {value2} (相似度: {similarity:.1f}%)',
+                    'severity': 'low' if similarity > 80 else 'medium'  # 根据相似度判断
+                }
+        
+        return result
+    
+    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
+        """标准表格比较"""
+        differences = []
+        max_rows = max(len(table1), len(table2))
+        
+        for i in range(max_rows):
+            row1 = table1[i] if i < len(table1) else []
+            row2 = table2[i] if i < len(table2) else []
+            
+            max_cols = max(len(row1), len(row2))
+            
+            for j in range(max_cols):
+                cell1 = row1[j] if j < len(row1) else ""
+                cell2 = row2[j] if j < len(row2) else ""
+                
+                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
+                    continue
+                
+                if cell1 != cell2:
+                    if self.detector.is_numeric(cell1) and self.detector.is_numeric(cell2):
+                        num1 = self.detector.parse_number(cell1)
+                        num2 = self.detector.parse_number(cell2)
+                        if abs(num1 - num2) > 0.001:
+                            differences.append({
+                                'type': 'table_amount',
+                                'position': f'行{i+1}列{j+1}',
+                                'file1_value': cell1,
+                                'file2_value': cell2,
+                                'description': f'金额不一致: {cell1} vs {cell2}',
+                                'severity': 'high',  # ✅ 添加:金额差异 = high
+                                'row_index': i,
+                                'col_index': j
+                            })
+                    else:
+                        differences.append({
+                            'type': 'table_text',
+                            'position': f'行{i+1}列{j+1}',
+                            'file1_value': cell1,
+                            'file2_value': cell2,
+                            'description': f'文本不一致: {cell1} vs {cell2}',
+                            'severity': 'medium',  # ✅ 添加:文本差异 = medium
+                            'row_index': i,
+                            'col_index': j
+                        })
+        
+        return differences
+    
+    def compare_table_flow_list(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
+        """流水列表表格比较算法"""
+        differences = []
+        
+        if not table1 or not table2:
+            return [{
+                'type': 'table_empty',
+                'description': '表格为空',
+                'severity': 'critical'
+            }]
+        
+        print(f"\n📋 开始流水表格对比...")
+        
+        # 检测表头位置
+        header_row_idx1 = self.detect_table_header_row(table1)
+        header_row_idx2 = self.detect_table_header_row(table2)
+        
+        if header_row_idx1 != header_row_idx2:
+            differences.append({
+                'type': 'table_header_position',
+                'position': '表头位置',
+                'file1_value': f'第{header_row_idx1 + 1}行',
+                'file2_value': f'第{header_row_idx2 + 1}行',
+                'description': f'表头位置不一致: 文件1在第{header_row_idx1 + 1}行,文件2在第{header_row_idx2 + 1}行',
+                'severity': 'high'
+            })
+        
+        # 比对表头前的内容
+        if header_row_idx1 > 0 or header_row_idx2 > 0:
+            print(f"\n📝 对比表头前的内容...")
+            pre_header_table1 = table1[:header_row_idx1] if header_row_idx1 > 0 else []
+            pre_header_table2 = table2[:header_row_idx2] if header_row_idx2 > 0 else []
+            
+            if pre_header_table1 or pre_header_table2:
+                pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
+                for diff in pre_header_diffs:
+                    diff['type'] = 'table_pre_header'
+                    diff['position'] = f"表头前{diff['position']}"
+                    diff['severity'] = 'medium'
+                differences.extend(pre_header_diffs)
+        
+        # 比较表头
+        headers1 = table1[header_row_idx1]
+        headers2 = table2[header_row_idx2]
+        
+        print(f"\n📋 对比表头...")
+        header_result = self.compare_table_headers(headers1, headers2)
+        
+        if not header_result['match']:
+            print(f"\n⚠️  表头文字存在差异")
+            for diff in header_result['differences']:
+                differences.append({
+                    'type': diff.get('type', 'table_header_mismatch'),
+                    'position': '表头',
+                    'file1_value': diff.get('header1', ''),
+                    'file2_value': diff.get('header2', ''),
+                    'description': diff['description'],
+                    'severity': diff.get('severity', 'high'),
+                })
+                if diff.get('severity') == 'critical':
+                    return differences
+        
+        # 检测列类型并比较数据行
+        column_types1 = self._detect_column_types(table1, header_row_idx1, headers1)
+        column_types2 = self._detect_column_types(table2, header_row_idx2, headers2)
+        
+        # 处理列类型不匹配
+        mismatched_columns = self._check_column_type_mismatch(
+            column_types1, column_types2, headers1, headers2, differences
+        )
+        
+        # 合并列类型
+        column_types = self._merge_column_types(column_types1, column_types2, mismatched_columns)
+        
+        # 逐行比较数据
+        data_diffs = self._compare_data_rows(
+            table1, table2, header_row_idx1, header_row_idx2,
+            headers1, column_types, mismatched_columns, header_result['match']
+        )
+        differences.extend(data_diffs)
+        
+        print(f"\n✅ 流水表格对比完成,发现 {len(differences)} 个差异")
+        return differences
+    
+    def _detect_column_types(self, table: List[List[str]], header_row_idx: int, 
+                            headers: List[str]) -> List[str]:
+        """检测列类型"""
+        column_types = []
+        for col_idx in range(len(headers)):
+            col_values = [
+                row[col_idx] 
+                for row in table[header_row_idx + 1:] 
+                if col_idx < len(row)
+            ]
+            col_type = self.detector.detect_column_type(col_values)
+            column_types.append(col_type)
+        return column_types
+    
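上面的按列取值逻辑可以用一个独立的小例子演示(示意代码,表格数据为虚构,取值方式与 `_detect_column_types` 一致):

```python
# 示意:跳过表头行,按列收集单元格值,供后续类型检测使用
table = [
    ['日期', '金额'],
    ['2023-12-01', '1,000.00'],
    ['2023-12-02', '2,500.50'],
]
header_row_idx = 0
for col_idx, name in enumerate(table[header_row_idx]):
    # 只取该列在数据行中实际存在的单元格,容忍短行
    values = [row[col_idx] for row in table[header_row_idx + 1:] if col_idx < len(row)]
    print(name, values)
# 日期 ['2023-12-01', '2023-12-02']
# 金额 ['1,000.00', '2,500.50']
```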
+    def _check_column_type_mismatch(self, column_types1: List[str], column_types2: List[str],
+                                   headers1: List[str], headers2: List[str],
+                                   differences: List[Dict]) -> List[int]:
+        """检查列类型不匹配"""
+        mismatched_columns = []
+        for col_idx in range(min(len(column_types1), len(column_types2))):
+            if column_types1[col_idx] != column_types2[col_idx]:
+                mismatched_columns.append(col_idx)
+                differences.append({
+                    'type': 'table_column_type_mismatch',
+                    'position': f'第{col_idx + 1}列',
+                    'file1_value': f'{headers1[col_idx]} ({column_types1[col_idx]})',
+                    'file2_value': f'{headers2[col_idx]} ({column_types2[col_idx]})',
+                    'description': f'列类型不一致: {column_types1[col_idx]} vs {column_types2[col_idx]}',
+                    'severity': 'high',
+                    'column_index': col_idx
+                })
+        
+        total_columns = min(len(column_types1), len(column_types2))
+        mismatch_ratio = len(mismatched_columns) / total_columns if total_columns > 0 else 0
+        
+        if mismatch_ratio > 0.5:
+            differences.append({
+                'type': 'table_header_critical',
+                'position': '表格列类型',
+                'file1_value': f'{len(mismatched_columns)}列类型不一致',
+                'file2_value': f'共{total_columns}列',
+                'description': f'列类型差异过大: {len(mismatched_columns)}/{total_columns}列不匹配 ({mismatch_ratio:.1%})',
+                'severity': 'critical'
+            })
+        
+        return mismatched_columns
+    
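50% 的阈值规则可以单独验证(示意代码,函数名 `exceeds_critical_threshold` 为说明用的假设命名,阈值与上文一致):

```python
# 示意:类型不匹配列占比超过 50% 时,整体差异升级为 critical
def exceeds_critical_threshold(mismatched: int, total: int) -> bool:
    ratio = mismatched / total if total > 0 else 0
    return ratio > 0.5

print(exceeds_critical_threshold(3, 4))  # True  → 追加 critical 级整体差异
print(exceeds_critical_threshold(2, 4))  # False → 仅保留各列 high 级差异(恰为 50% 不升级)
```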
+    def _merge_column_types(self, column_types1: List[str], column_types2: List[str],
+                           mismatched_columns: List[int]) -> List[str]:
+        """合并列类型"""
+        column_types = []
+        for col_idx in range(max(len(column_types1), len(column_types2))):
+            if col_idx >= len(column_types1):
+                column_types.append(column_types2[col_idx])
+            elif col_idx >= len(column_types2):
+                column_types.append(column_types1[col_idx])
+            elif col_idx in mismatched_columns:
+                type1 = column_types1[col_idx]
+                type2 = column_types2[col_idx]
+                
+                if type1 == 'text' or type2 == 'text':
+                    column_types.append('text')
+                elif type1 == 'text_number' or type2 == 'text_number':
+                    column_types.append('text_number')
+                else:
+                    column_types.append(type1)
+            else:
+                column_types.append(column_types1[col_idx])
+        
+        return column_types
+    
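冲突时的类型优先级(text 优先于 text_number,其余保留文件1的类型)可以用独立函数示意(`merge_type` 为说明用的假设命名,规则与 `_merge_column_types` 的冲突分支一致):

```python
# 示意:两侧列类型冲突时的合并优先级
def merge_type(type1: str, type2: str) -> str:
    if type1 == type2:
        return type1
    if 'text' in (type1, type2):          # 任一侧为 text,降级为 text
        return 'text'
    if 'text_number' in (type1, type2):   # 其次 text_number
        return 'text_number'
    return type1                          # 否则保留文件1的类型

print(merge_type('numeric', 'text'))         # text
print(merge_type('text_number', 'numeric'))  # text_number
print(merge_type('numeric', 'datetime'))     # numeric
```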
+    def _compare_data_rows(self, table1: List[List[str]], table2: List[List[str]],
+                          header_row_idx1: int, header_row_idx2: int,
+                          headers1: List[str], column_types: List[str],
+                          mismatched_columns: List[int], header_match: bool) -> List[Dict]:
+        """逐行比较数据"""
+        differences = []
+        data_rows1 = table1[header_row_idx1 + 1:]
+        data_rows2 = table2[header_row_idx2 + 1:]
+        max_rows = max(len(data_rows1), len(data_rows2))
+        
+        for row_idx in range(max_rows):
+            row1 = data_rows1[row_idx] if row_idx < len(data_rows1) else []
+            row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
+            actual_row_num = header_row_idx1 + row_idx + 2  # 表格中的 1-based 行号(表头行之后起算)
+            
+            if not row1:
+                differences.append({
+                    'type': 'table_row_missing',
+                    'position': f'第{actual_row_num}行',
+                    'file1_value': '',
+                    'file2_value': ', '.join(row2),
+                    'description': f'文件1缺少第{actual_row_num}行',
+                    'severity': 'high',
+                    'row_index': actual_row_num
+                })
+                continue
+            
+            if not row2:
+                differences.append({
+                    'type': 'table_row_missing',
+                    'position': f'第{actual_row_num}行',
+                    'file1_value': ', '.join(row1),
+                    'file2_value': '',
+                    'description': f'文件2缺少第{actual_row_num}行',
+                    'severity': 'high',
+                    'row_index': actual_row_num
+                })
+                continue
+            
+            # 逐列比较
+            max_cols = max(len(row1), len(row2))
+            for col_idx in range(max_cols):
+                cell1 = row1[col_idx] if col_idx < len(row1) else ''
+                cell2 = row2[col_idx] if col_idx < len(row2) else ''
+                
+                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
+                    continue
+                
+                column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
+                column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
+                
+                compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
+                
+                if not compare_result['match']:
+                    diff_info = compare_result['difference']
+                    type_mismatch_note = ""
+                    if col_idx in mismatched_columns:
+                        type_mismatch_note = " [列类型冲突]"
+                    
+                    # ✅ 确定最终严重度:优先使用 diff_info 的 severity
+                    base_severity = diff_info.get('severity', 'medium')
+                    
+                    # 如果列类型冲突,且基础严重度低于 high,则提升到 high(不降低已有的 critical)
+                    if col_idx in mismatched_columns and base_severity not in ('high', 'critical'):
+                        final_severity = 'high'
+                    else:
+                        final_severity = base_severity
+                    
+                    differences.append({
+                        'type': diff_info['type'],
+                        'position': f'第{actual_row_num}行第{col_idx + 1}列',
+                        'file1_value': diff_info['value1'],
+                        'file2_value': diff_info['value2'],
+                        'description': diff_info['description'] + type_mismatch_note,
+                        'severity': final_severity,  # ✅ 使用计算后的严重度
+                        'row_index': actual_row_num,
+                        'col_index': col_idx,
+                        'column_name': column_name,
+                        'column_type': column_type,
+                        'column_type_mismatch': col_idx in mismatched_columns,
+                    })
+        
+        return differences

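按行位置对齐的缺行判定可以用一个最小示例演示(示意代码,数据为虚构,差异字段做了简化):

```python
# 示意:两表按行位置对齐,某侧取不到行即记为缺行差异
rows1 = [['2023-01-01', '100.00'], ['2023-01-02', '200.00']]
rows2 = [['2023-01-01', '100.00']]

diffs = []
for i in range(max(len(rows1), len(rows2))):
    r1 = rows1[i] if i < len(rows1) else []
    r2 = rows2[i] if i < len(rows2) else []
    if not r1 or not r2:
        missing_in = '文件1' if not r1 else '文件2'
        diffs.append({'type': 'table_row_missing', 'row': i + 1,
                      'description': f'{missing_in}缺少第{i + 1}行'})

print(diffs)
# [{'type': 'table_row_missing', 'row': 2, 'description': '文件2缺少第2行'}]
```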
+ 83 - 0
ocr_comparator/text_processor.py

@@ -0,0 +1,83 @@
+import re
+from typing import List
+
+
+class TextProcessor:
+    """文本标准化和预处理"""
+    
+    @staticmethod
+    def normalize_text(text: str) -> str:
+        """标准化文本:去除多余空格、回车等无效字符"""
+        if not text:
+            return ""
+        text = re.sub(r'\s+', ' ', text.strip())
+        text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
+        return text
+    
+    @staticmethod
+    def strip_markdown_formatting(text: str) -> str:
+        """移除Markdown格式标记,只保留纯文本内容"""
+        if not text:
+            return ""
+        
+        text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)  # 标题标记(逐行匹配)
+        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
+        text = re.sub(r'__(.+?)__', r'\1', text)
+        text = re.sub(r'\*(.+?)\*', r'\1', text)
+        text = re.sub(r'_(.+?)_', r'\1', text)
+        text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
+        text = re.sub(r'!\[.*?\]\(.+?\)', '', text)
+        text = re.sub(r'`(.+?)`', r'\1', text)
+        text = re.sub(r'<[^>]+>', '', text)
+        text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)  # 无序列表标记
+        text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)  # 有序列表标记
+        text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)      # 引用标记
+        text = re.sub(r'\s+', ' ', text.strip())
+        
+        return text
+    
+    @staticmethod
+    def normalize_punctuation(text: str) -> str:
+        """统一标点符号 - 将中文标点转换为英文标点"""
+        if not text:
+            return ""
+        
+        punctuation_map = {
+            ':': ':', ';': ';', ',': ',', '。': '.', '!': '!', '?': '?',
+            '(': '(', ')': ')', '【': '[', '】': ']', '《': '<', '》': '>',
+            '“': '"', '”': '"', '‘': "'", '’': "'", '、': ',', '—': '-',
+            '…': '...', '~': '~',
+        }
+        
+        for cn_punct, en_punct in punctuation_map.items():
+            text = text.replace(cn_punct, en_punct)
+        
+        return text
+    
+    @staticmethod
+    def normalize_text_for_comparison(text: str) -> str:
+        """用于比较的文本标准化"""
+        text = TextProcessor.strip_markdown_formatting(text)
+        text = TextProcessor.normalize_punctuation(text)
+        text = TextProcessor.normalize_text(text)
+        return text
+    
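三步标准化流程(去 Markdown 标记 → 统一标点 → 压缩空白)的精简示意如下(`normalize_for_comparison` 为独立的演示函数,仅覆盖上文规则的一小部分):

```python
import re

# 示意:比较前的文本标准化,三步顺序与 normalize_text_for_comparison 一致
def normalize_for_comparison(text: str) -> str:
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)       # 第一步:去除加粗等 Markdown 标记
    text = text.replace(':', ':').replace(',', ',')   # 第二步:中文标点转英文
    text = re.sub(r'\s+', ' ', text.strip())           # 第三步:压缩多余空白
    return text

print(normalize_for_comparison('**合计**:28,239,305.48'))
# 合计:28,239,305.48
```

这样,OCR 结果中"加粗的合计行"与"普通文本的合计行"在比较时会被视为相同内容。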
+    @staticmethod
+    def is_image_reference(text: str) -> bool:
+        """判断是否为图片引用或描述"""
+        image_keywords = [
+            '图', '图片', '图像', 'image', 'figure', 'fig',
+            '照片', '截图', '示意图', '流程图', '结构图'
+        ]
+        
+        for keyword in image_keywords:
+            if keyword in text.lower():
+                return True
+        
+        if re.search(r'!\[.*?\]\(.*?\)', text):
+            return True
+            
+        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
+            return True
+            
+        return False
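图片引用判断规则的独立精简示意如下(`looks_like_image` 为演示用命名,关键词列表相比上文有所删减):

```python
import re

# 示意:Markdown 图片语法、HTML img 标签或图片类关键词,任一命中即视为图片引用
def looks_like_image(text: str) -> bool:
    if re.search(r'!\[.*?\]\(.*?\)', text):            # Markdown 图片语法
        return True
    if re.search(r'<img[^>]*>', text, re.IGNORECASE):  # HTML 图片标签
        return True
    return any(k in text.lower() for k in ('图片', 'figure', '截图'))

print(looks_like_image('![logo](a.png)'))  # True
print(looks_like_image('货币资金'))         # False
```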