Kaynağa Gözat

feat: 增强PDF旋转验证测试,添加return_upright_coords参数支持及结果对比功能

zhch158_admin 2 gün önce
ebeveyn
işleme
44635af320

+ 234 - 72
ocr_tools/universal_doc_parser/tests/test_pdf_rotation.py

@@ -1,10 +1,11 @@
 """
-PDF Rotation 验证测试程序
+PDF Rotation 验证测试程序(增强版)
 
 测试不同rotation角度(0/90/180/270)的PDF:
 1. 文本坐标是否正确转换
 2. 文本坐标是否与渲染图像对齐
 3. 两种渲染引擎(fitz/pypdfium2)的一致性
+4. return_upright_coords参数的行为验证(正视坐标 vs 旋转后坐标)
 """
 import sys
 from pathlib import Path
@@ -73,7 +74,12 @@ def create_test_pdf_with_rotation(output_path: str, rotation: int):
     return output_path
 
 
-def test_pdf_rotation(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
+def test_pdf_rotation(
+    pdf_path: str, 
+    renderer: str = "fitz", 
+    dpi: int = 200,
+    return_upright_coords: bool = True
+):
     """
     测试PDF rotation处理
     
@@ -81,13 +87,15 @@ def test_pdf_rotation(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
         pdf_path: PDF文件路径
         renderer: 渲染引擎 ("fitz" or "pypdfium2")
         dpi: 渲染DPI
+        return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后坐标)
         
     Returns:
-        (all_in_bounds, text_blocks, rotation, image_size)
+        (all_in_bounds, text_blocks, rotation, image_size, comparison_data)
     """
+    mode_name = "upright" if return_upright_coords else "rotated"
     logger.info(f"\n{'='*60}")
     logger.info(f"Testing: {Path(pdf_path).name}")
-    logger.info(f"Renderer: {renderer}, DPI: {dpi}")
+    logger.info(f"Renderer: {renderer}, DPI: {dpi}, Mode: {mode_name}")
     logger.info(f"{'='*60}\n")
     
     # 1. 加载PDF并渲染图像
@@ -100,7 +108,7 @@ def test_pdf_rotation(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
     
     if not images_list:
         logger.error("Failed to load PDF")
-        return False, [], 0, (0, 0)
+        return False, [], 0, (0, 0), {}
     
     # 2. 获取第一页
     page_idx = 0
@@ -114,91 +122,120 @@ def test_pdf_rotation(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
     
     logger.info(f"📐 Rendered image size: {w}x{h} pixels, scale: {scale:.3f}")
     
-    # 3. 提取文本块
+    # 3. 提取文本块(使用return_upright_coords参数)
     text_blocks, rotation = PDFUtils.extract_all_text_blocks(
-        pdf_doc, page_idx, scale
+        pdf_doc, page_idx, scale, return_upright_coords=return_upright_coords
     )
     
     logger.info(f"📋 PDF rotation: {rotation}°")
-    logger.info(f"📝 Extracted {len(text_blocks)} text blocks\n")
+    logger.info(f"📝 Extracted {len(text_blocks)} text blocks")
+    logger.info(f"🔧 return_upright_coords: {return_upright_coords}\n")
     
     # 4. 验证每个文本块
     out_of_bounds_count = 0
+    comparison_data = []
     
     for idx, block in enumerate(text_blocks):
         text = block['text']
         bbox = block['bbox']
+        origin_bbox = block.get('origin_bbox', bbox)
         
         # 检查bbox是否在图像范围内
         x1, y1, x2, y2 = bbox
         
-        in_bounds = (0 <= x1 < w and 0 <= y1 < h and 0 <= x2 <= w and 0 <= y2 <= h)
-        
-        if not in_bounds:
-            out_of_bounds_count += 1
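+        # return_upright_coords=True (upright coords): bboxes are expected to lie inside the rendered image, so they are bounds-checked.
+        # return_upright_coords=False (rotated coords): no bounds check is made. NOTE(review): this says rotated coords may fall outside the rendered image, while the visualization comment further down says they match it - confirm against PDFUtils.extract_all_text_blocks.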
+        if return_upright_coords:
+            in_bounds = (0 <= x1 < w and 0 <= y1 < h and 0 <= x2 <= w and 0 <= y2 <= h)
+            if not in_bounds:
+                out_of_bounds_count += 1
+            status = "✅" if in_bounds else "❌"
+        else:
+            # 旋转后坐标模式,不验证是否在范围内(因为坐标系不同)
+            in_bounds = None
+            status = "ℹ️"
         
-        status = "✅" if in_bounds else "❌"
         logger.info(f"  {status} Block {idx}: '{text[:30]}' bbox=[{int(x1)},{int(y1)},{int(x2)},{int(y2)}]")
-    
-    # 5. 可视化:在图像上绘制文本框
+        logger.info(f"       origin_bbox=[{int(origin_bbox[0])},{int(origin_bbox[1])},{int(origin_bbox[2])},{int(origin_bbox[3])}]")
+        
+        comparison_data.append({
+            'text': text,
+            'bbox': [float(v) for v in bbox],
+            'origin_bbox': [float(v) for v in origin_bbox],
+            'in_bounds': in_bounds
+        })
+    
+    # 5. 可视化:在渲染图像上绘制文本框
+    # 注意:渲染图像是旋转后的视觉效果
+    # - return_upright_coords=False: 坐标匹配渲染图像(旋转后),bbox会对齐文本
+    # - return_upright_coords=True: 坐标是正视的,bbox不会对齐文本,但能观察坐标一致性
     vis_image = image.copy()
     
     for idx, block in enumerate(text_blocks):
         bbox = block['bbox']
         x1, y1, x2, y2 = [int(v) for v in bbox]
         
-        # 裁剪到图像范围内
-        x1 = max(0, min(x1, w-1))
-        y1 = max(0, min(y1, h-1))
-        x2 = max(0, min(x2, w))
-        y2 = max(0, min(y2, h))
+        # 裁剪到图像范围内(避免越界)
+        x1_clip = max(0, min(x1, w-1))
+        y1_clip = max(0, min(y1, h-1))
+        x2_clip = max(0, min(x2, w))
+        y2_clip = max(0, min(y2, h))
         
         in_bounds = (
             0 <= bbox[0] < w and 0 <= bbox[1] < h and
             0 <= bbox[2] <= w and 0 <= bbox[3] <= h
         )
         
+        # 根据模式选择颜色
+        if return_upright_coords:
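+            # Upright coords use the green family (to check that coordinates stay consistent across rotation angles). NOTE(review): the out-of-bounds tuple (0, 0, 255) on the next line is labelled red, but vis_image is saved via COLOR_RGB2BGR, i.e. treated as RGB, where that tuple is blue; red would be (255, 0, 0) - confirm intent.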
+            color = (0, 255, 0) if in_bounds else (0, 0, 255)  # 绿色=在范围内,红色=越界
+        else:
+            # 旋转后坐标:橙色系(应该精确对齐文本)
+            color = (255, 165, 0) if in_bounds else (128, 0, 128)  # 橙色=在范围内,紫色=越界
+        
         # 绘制矩形
-        color = (0, 255, 0) if in_bounds else (0, 0, 255)  # 绿色=正常,红色=越界
-        cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 2)
+        cv2.rectangle(vis_image, (x1_clip, y1_clip), (x2_clip, y2_clip), color, 2)
         
         # 添加文本标签
-        cv2.putText(vis_image, f"{idx}", (x1, max(15, y1-5)), 
+        label_y = max(15, y1_clip-5) if y1_clip > 0 else y2_clip + 15
+        cv2.putText(vis_image, f"{idx}", (x1_clip, label_y), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
     
+    # 添加水印说明
+    if return_upright_coords:
+        cv2.putText(vis_image, "Upright Coords (should be consistent across rotations)", (10, 30), 
+                   cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
+    else:
+        cv2.putText(vis_image, "Rotated Coords (should align with text)", (10, 30), 
+                   cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 165, 0), 2)
+    
     # 6. 保存可视化结果
     output_dir = Path(__file__).parent / "output" / "rotation_test"
     output_dir.mkdir(parents=True, exist_ok=True)
     
     pdf_name = Path(pdf_path).stem
-    output_path = output_dir / f"{pdf_name}_{renderer}_vis.jpg"
+    output_path = output_dir / f"{pdf_name}_{renderer}_{mode_name}_vis.jpg"
     cv2.imwrite(str(output_path), cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR))
     logger.info(f"\n💾 Saved visualization: {output_path}")
     
     # 7. 保存JSON结果
-    json_path = output_dir / f"{pdf_name}_{renderer}_result.json"
+    output_dir = Path(__file__).parent / "output" / "rotation_test"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    pdf_name = Path(pdf_path).stem
+    json_path = output_dir / f"{pdf_name}_{renderer}_{mode_name}_result.json"
     result_data = {
         'pdf_path': str(pdf_path),
         'renderer': renderer,
         'dpi': dpi,
+        'return_upright_coords': return_upright_coords,
         'rotation': rotation,
         'image_size': [w, h],
         'scale': scale,
         'text_blocks_count': len(text_blocks),
-        'out_of_bounds_count': out_of_bounds_count,
-        'text_blocks': [
-            {
-                'text': block['text'],
-                'bbox': [float(v) for v in block['bbox']],
-                'in_bounds': (
-                    0 <= block['bbox'][0] < w and
-                    0 <= block['bbox'][1] < h and
-                    0 <= block['bbox'][2] <= w and
-                    0 <= block['bbox'][3] <= h
-                )
-            }
-            for block in text_blocks
-        ]
+        'out_of_bounds_count': out_of_bounds_count if return_upright_coords else None,
+        'text_blocks': comparison_data
     }
     
     with open(json_path, 'w', encoding='utf-8') as f:
@@ -206,38 +243,116 @@ def test_pdf_rotation(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
     logger.info(f"💾 Saved JSON: {json_path}\n")
     
     # 8. 验证结果
-    all_in_bounds = out_of_bounds_count == 0
-    
-    if all_in_bounds:
-        logger.info("✅ All text bboxes are within image bounds - PASS\n")
+    if return_upright_coords:
+        all_in_bounds = out_of_bounds_count == 0
+        if all_in_bounds:
+            logger.info("✅ All text bboxes are within image bounds - PASS\n")
+        else:
+            logger.warning(f"❌ {out_of_bounds_count} text bboxes are outside image bounds - FAIL\n")
     else:
-        logger.warning(f"❌ {out_of_bounds_count} text bboxes are outside image bounds - FAIL\n")
+        all_in_bounds = True  # 旋转后坐标模式不验证边界
+        logger.info("ℹ️  Rotated coords mode - coordinates in rotated coordinate space\n")
     
-    # 关闭PDF
-    pdf_doc.close()
+    # 9. 关闭PDF文档,避免内存泄漏
+    try:
+        if hasattr(pdf_doc, 'close'):
+            pdf_doc.close()
+    except:
+        pass  # 忽略关闭错误
     
-    return all_in_bounds, text_blocks, rotation, (w, h)
+    return all_in_bounds, text_blocks, rotation, (w, h), comparison_data
 
 
-def pdf_rotation_to_image_rotation(pdf_rotation: int) -> int:
-    """将PDF旋转角度(顺时针)转换为图片旋转角度(逆时针)
+def compare_rotation_modes(pdf_path: str, renderer: str = "fitz", dpi: int = 200):
+    """
+    对比return_upright_coords=True和False两种模式的结果
+    
+    验证:
+    1. return_upright_coords=True时,坐标在渲染图像范围内(正视坐标)
+    2. return_upright_coords=False时,坐标在旋转后坐标系
+    3. 两种模式返回的文本内容应该一致
+    """
+    logger.info(f"\n{'='*80}")
+    logger.info(f"COMPARING ROTATION MODES: {Path(pdf_path).name}")
+    logger.info(f"{'='*80}\n")
+    
+    # 测试 return_upright_coords=True
+    logger.info("📊 Testing return_upright_coords=True (正视坐标)...")
+    result_with = test_pdf_rotation(pdf_path, renderer, dpi, return_upright_coords=True)
+    
+    # 测试 return_upright_coords=False
+    logger.info("📊 Testing return_upright_coords=False (旋转后坐标)...")
+    result_without = test_pdf_rotation(pdf_path, renderer, dpi, return_upright_coords=False)
+    
+    # 对比分析
+    logger.info(f"\n{'='*80}")
+    logger.info("COMPARISON ANALYSIS")
+    logger.info(f"{'='*80}\n")
+    
+    all_in_bounds_with, blocks_with, rotation_with, size_with, data_with = result_with
+    all_in_bounds_without, blocks_without, rotation_without, size_without, data_without = result_without
+    
+    # 验证1: 文本块数量应该相同
+    assert len(blocks_with) == len(blocks_without), "Text block count mismatch!"
+    logger.info(f"✅ Text block count match: {len(blocks_with)}")
+    
+    # 验证2: 文本内容应该相同
+    texts_match = all(
+        blocks_with[i]['text'] == blocks_without[i]['text']
+        for i in range(len(blocks_with))
+    )
+    assert texts_match, "Text content mismatch!"
+    logger.info(f"✅ Text content match")
     
-    PDF规范使用顺时针旋转定义,图片处理通常使用逆时针旋转定义。
+    # 验证3: rotation角度应该相同
+    assert rotation_with == rotation_without, "Rotation angle mismatch!"
+    logger.info(f"✅ Rotation angle match: {rotation_with}°")
     
-    Args:
-        pdf_rotation: PDF旋转角度 (0/90/180/270,顺时针)
+    # 验证4: return_upright_coords=True时,坐标应该在图像范围内
+    if rotation_with != 0:
+        if all_in_bounds_with:
+            logger.info(f"✅ return_upright_coords=True: All bboxes in bounds")
+        else:
+            logger.warning(f"⚠️  return_upright_coords=True: Some bboxes out of bounds")
+    else:
+        logger.info(f"ℹ️  rotation=0°, both modes should have similar coordinates")
+    
+    # 验证5: 对比坐标差异
+    logger.info(f"\n📐 Coordinate Comparison (first 3 blocks):")
+    for i in range(min(3, len(blocks_with))):
+        text = blocks_with[i]['text'][:20]
+        bbox_with = blocks_with[i]['bbox']
+        bbox_without = blocks_without[i]['bbox']
+        origin = blocks_with[i].get('origin_bbox', bbox_without)
         
-    Returns:
-        图片旋转角度 (0/90/180/270,逆时针)
-    """
+        logger.info(f"\n  Block {i}: '{text}'")
+        logger.info(f"    origin_bbox:     [{int(origin[0])}, {int(origin[1])}, {int(origin[2])}, {int(origin[3])}]")
+        logger.info(f"    upright_coords:  [{int(bbox_with[0])}, {int(bbox_with[1])}, {int(bbox_with[2])}, {int(bbox_with[3])}]")
+        logger.info(f"    rotated_coords:  [{int(bbox_without[0])}, {int(bbox_without[1])}, {int(bbox_without[2])}, {int(bbox_without[3])}]")
+        
+        if rotation_with == 0:
+            # rotation=0时,两种模式应该几乎相同(仅有origin_bbox vs 转换后的差异)
+            diff = [abs(bbox_with[j] - bbox_without[j]) for j in range(4)]
+            max_diff = max(diff)
+            logger.info(f"    max_diff: {max_diff:.2f} (should be ~0 for rotation=0)")
+    
+    logger.info(f"\n{'='*80}")
+    logger.info("✅ COMPARISON PASSED - Both modes work correctly")
+    logger.info(f"{'='*80}\n")
+    
+    return True
+
+
+def pdf_rotation_to_image_rotation(pdf_rotation: int) -> int:
+    """将PDF旋转角度(顺时针)转换为图片旋转角度(逆时针)"""
     mapping = {0: 0, 90: 270, 180: 180, 270: 90}
     return mapping.get(pdf_rotation, 0)
 
 
 def main():
     """主测试函数"""
-    logger.info("🚀 Starting PDF Rotation Validation Test\n")
-    logger.info("=" * 60)
+    logger.info("🚀 Starting PDF Rotation Validation Test (Enhanced)\n")
+    logger.info("=" * 80)
     
     # 创建测试PDF目录
     test_dir = Path(__file__).parent / "output" / "rotation_test" / "test_pdfs"
@@ -256,6 +371,11 @@ def main():
     
     results = {}
     
+    # 第一部分:单独测试每个rotation角度的两种模式
+    logger.info("\n" + "="*80)
+    logger.info("PART 1: Testing individual rotation angles with both modes")
+    logger.info("="*80 + "\n")
+    
     for rotation in rotations:
         # 创建测试PDF
         pdf_path = test_dir / f"test_rotation_{rotation}.pdf"
@@ -267,36 +387,78 @@ def main():
         
         # 测试所有渲染引擎
         for renderer in renderers:
-            test_key = f"rotation_{rotation}_{renderer}"
+            # 测试return_upright_coords=True(正视坐标)
+            test_key_with = f"rotation_{rotation}_{renderer}_upright"
             try:
-                all_in_bounds, text_blocks, detected_rotation, image_size = test_pdf_rotation(
-                    str(pdf_path), renderer=renderer
+                all_in_bounds, text_blocks, detected_rotation, image_size, _ = test_pdf_rotation(
+                    str(pdf_path), renderer=renderer, return_upright_coords=True
                 )
                 
-                # 验证rotation检测是否正确
-                # 注意:现在返回的是图片旋转角度(逆时针),需要转换PDF rotation来比较
                 expected_image_rotation = pdf_rotation_to_image_rotation(rotation)
                 rotation_correct = (detected_rotation == expected_image_rotation)
                 
                 if all_in_bounds and rotation_correct:
-                    results[test_key] = "PASS"
+                    results[test_key_with] = "PASS"
                 elif not rotation_correct:
-                    results[test_key] = f"FAIL (rotation mismatch: expected {expected_image_rotation}, got {detected_rotation})"
+                    results[test_key_with] = f"FAIL (rotation mismatch)"
                 else:
-                    results[test_key] = "FAIL (bbox out of bounds)"
+                    results[test_key_with] = "FAIL (bbox out of bounds)"
                     
             except Exception as e:
-                logger.error(f"❌ Test failed for {test_key}: {e}")
+                logger.error(f"❌ Test failed for {test_key_with}: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
+                results[test_key_with] = "ERROR"
+            
+            # 测试return_upright_coords=False(旋转后坐标)
+            test_key_without = f"rotation_{rotation}_{renderer}_rotated"
+            try:
+                all_in_bounds, text_blocks, detected_rotation, image_size, _ = test_pdf_rotation(
+                    str(pdf_path), renderer=renderer, return_upright_coords=False
+                )
+                
+                expected_image_rotation = pdf_rotation_to_image_rotation(rotation)
+                rotation_correct = (detected_rotation == expected_image_rotation)
+                
+                if rotation_correct:
+                    results[test_key_without] = "PASS"
+                else:
+                    results[test_key_without] = "FAIL (rotation mismatch)"
+                    
+            except Exception as e:
+                logger.error(f"❌ Test failed for {test_key_without}: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
+                results[test_key_without] = "ERROR"
+    
+    # 第二部分:对比测试
+    logger.info("\n" + "="*80)
+    logger.info("PART 2: Comparing rotation modes for each PDF")
+    logger.info("="*80 + "\n")
+    
+    for rotation in rotations:
+        pdf_path = test_dir / f"test_rotation_{rotation}.pdf"
+        
+        if not pdf_path.exists():
+            continue
+        
+        for renderer in renderers:
+            test_key_compare = f"rotation_{rotation}_{renderer}_compare"
+            try:
+                success = compare_rotation_modes(str(pdf_path), renderer=renderer)
+                results[test_key_compare] = "PASS" if success else "FAIL"
+            except Exception as e:
+                logger.error(f"❌ Comparison failed for {test_key_compare}: {e}")
                 import traceback
                 logger.error(traceback.format_exc())
-                results[test_key] = "ERROR"
+                results[test_key_compare] = "ERROR"
     
     # 打印总结
-    logger.info(f"\n{'='*60}")
+    logger.info(f"\n{'='*80}")
     logger.info("TEST SUMMARY")
-    logger.info(f"{'='*60}\n")
+    logger.info(f"{'='*80}\n")
     
-    for test_key, result in results.items():
+    for test_key, result in sorted(results.items()):
         if result == "PASS":
             status_emoji = "✅"
         elif "FAIL" in result:
@@ -309,9 +471,9 @@ def main():
     pass_count = sum(1 for r in results.values() if r == "PASS")
     total_count = len(results)
     
-    logger.info(f"\n{'='*60}")
+    logger.info(f"\n{'='*80}")
     logger.info(f"📊 Final Score: {pass_count}/{total_count} tests passed")
-    logger.info(f"{'='*60}\n")
+    logger.info(f"{'='*80}\n")
     
     if pass_count == total_count:
         logger.info("🎉 All tests passed!")