5 месяцев назад · 321e1fe2c7
--- a/ocr_tools/universal_doc_parser/tests/cell_fusion_config_example.yaml
+++ b/ocr_tools/universal_doc_parser/tests/cell_fusion_config_example.yaml
@@ -24,8 +24,7 @@ wired_table_recognizer:
 
															     rtdetr_conf_threshold: 0.5  # RT-DETR置信度阈值
														
 
															     # 功能开关
														
 
															-    enable_ocr_compensation: true      # 启用OCR孤立文本补偿
														
 
															-    skip_rtdetr_for_txt_pdf: true      # 🎯 文字PDF跳过RT-DETR（自适应策略）
														
 
															+    enable_ocr_compensation: true      # 启用OCR边缘补偿
														
 
															   # 调试选项
														
 
															   debug_options:
														
@@ -35,8 +34,8 @@ wired_table_recognizer:
 
															     save_fusion_comparison: true  # 保存融合对比图
														
 
															 # 使用说明：
														
 
															-# 1. 文字PDF (pdf_type='txt'): 自动跳过RT-DETR，使用纯UNet模式（无噪声干扰）
														
 
															-# 2. 扫描PDF (pdf_type='ocr'): 启用融合模式，结合UNet、RT-DETR和OCR三路结果
														
 
															+# 1. 所有PDF类型都使用UNet+RT-DETR融合模式
														
 
															+# 2. OCR边缘补偿在融合后执行，补偿"有OCR文本但无单元格覆盖"的位置
														
 
															 # 3. UNet结果为空: 强制启用RT-DETR补救
														
 
															 # 4. 融合失败: 自动降级到UNet-only模式
														
--- a/ocr_tools/universal_doc_parser/tests/test_cell_fusion.py
+++ b/ocr_tools/universal_doc_parser/tests/test_cell_fusion.py
@@ -71,8 +71,7 @@ def test_fusion_engine(detector):
 
															         'iou_merge_threshold': 0.7,
														
 
															         'iou_nms_threshold': 0.5,
														
 
															         'rtdetr_conf_threshold': 0.5,
														
 
															-        'enable_ocr_compensation': True,
														
 
															-        'skip_rtdetr_for_txt_pdf': True
														
 
															+        'enable_ocr_compensation': True
														
 
															     }
														
 
															     # 初始化
														
@@ -93,33 +92,30 @@ def test_fusion_engine(detector):
 
															         {'bbox': [20, 70, 80, 90], 'text': 'Cell 2'}
														
 
															     ]
														
 
															-    # Test 2.1: 文字PDF模式（应跳过RT-DETR）
														
 
															-    print("\n📄 Test 2.1: Text PDF mode (should skip RT-DETR)")
														
 
															+    # Test 2.1: 文字PDF模式（现在也使用RT-DETR融合）
														
 
															+    print("\n📄 Test 2.1: Text PDF mode (now uses RT-DETR fusion)")
														
 
															     fused_cells, stats = engine.fuse(
														
 
															         table_image=table_image,
														
 
															         unet_cells=unet_cells,
														
 
															         ocr_boxes=ocr_boxes,
														
 
															-        pdf_type='txt',
														
 
															-        upscale=1.0
														
 
															+        pdf_type='txt'
														
 
															     )
														
 
															     print(f"   Use RT-DETR: {stats['use_rtdetr']}")
														
 
															     print(f"   Fused cells: {len(fused_cells)}")
														
 
															-    assert not stats['use_rtdetr'], "❌ Should skip RT-DETR for text PDF"
														
 
															-    assert len(fused_cells) == len(unet_cells), "❌ Should keep UNet cells only"
														
 
															-    print("   ✅ Correctly skipped RT-DETR for text PDF")
														
 
															+    assert stats['use_rtdetr'], "✔️ Now uses RT-DETR for all PDF types"
														
 
															+    print("   ✅ Correctly enabled RT-DETR for text PDF")
														
 
															-    # Test 2.2: 扫描PDF模式（应启用RT-DETR，但因为是假图片可能失败）
														
 
															-    print("\n🔍 Test 2.2: Scan PDF mode (should enable RT-DETR)")
														
 
															+    # Test 2.2: 扫描PDF模式
														
 
															+    print("\n🔍 Test 2.2: Scan PDF mode")
														
 
															     fused_cells, stats = engine.fuse(
														
 
															         table_image=table_image,
														
 
															         unet_cells=unet_cells,
														
 
															         ocr_boxes=ocr_boxes,
														
 
															-        pdf_type='ocr',
														
 
															-        upscale=1.0
														
 
															+        pdf_type='ocr'
														
 
															     )
														
 
															     print(f"   Use RT-DETR: {stats['use_rtdetr']}")
														
 
															     print(f"   Stats: {stats}")
														
 
															-    print("   ✅ Fusion completed (RT-DETR may return 0 cells on blank image)")
														
 
															+    print("   ✅ Fusion completed")
														
 
															     return engine
														
@@ -130,24 +126,25 @@ def test_adaptive_strategy():
 
															     print("Test 3: 自适应策略测试")
														
 
															     print("=" * 60)
														
 
															-    engine = CellFusionEngine(rtdetr_detector=None, config={'skip_rtdetr_for_txt_pdf': True})
														
 
															+    engine = CellFusionEngine(rtdetr_detector=None, config={})
														
 
															-    # Test 3.1: 文字PDF + 正常单元格数 → 跳过
														
 
															+    # Test 3.1: 文字PDF + 检测器未初始化 → 跳过
														
 
															     should_use = engine.should_use_rtdetr('txt', unet_cell_count=10, table_size=(500, 500))
														
 
															-    print(f"📄 Text PDF, 10 cells: use_rtdetr={should_use}")
														
 
															-    assert not should_use, "❌ Should skip RT-DETR"
														
 
															+    print(f"📄 Text PDF, 10 cells, no detector: use_rtdetr={should_use}")
														
 
															+    assert not should_use, "❌ Should skip (detector not available)"
														
 
															     print("   ✅ Correct")
														
 
															-    # Test 3.2: 扫描PDF + 正常单元格数 → 跳过（因为检测器未初始化）
														
 
															+    # Test 3.2: 扫描PDF + 检测器未初始化 → 跳过
														
 
															     should_use = engine.should_use_rtdetr('ocr', unet_cell_count=10, table_size=(500, 500))
														
 
															     print(f"🔍 Scan PDF, 10 cells, no detector: use_rtdetr={should_use}")
														
 
															     assert not should_use, "❌ Should skip (detector not available)"
														
 
															     print("   ✅ Correct")
														
 
															-    # Test 3.3: UNet为空 → 强制启用（但检测器未初始化，仍跳过）
														
 
															+    # Test 3.3: UNet为空 + 检测器未初始化 → 仍跳过
														
 
															     should_use = engine.should_use_rtdetr('ocr', unet_cell_count=0, table_size=(500, 500))
														
 
															     print(f"🚨 Scan PDF, 0 cells, no detector: use_rtdetr={should_use}")
														
 
															-    print("   ⚠️ Would force enable if detector available")
														
 
															+    assert not should_use, "❌ Should skip (detector not available)"
														
 
															+    print("   ✅ Correct (would force enable if detector available)")
														
 
															     print("\n✅ All adaptive strategy tests passed")