|
@@ -0,0 +1,261 @@
|
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
|
+"""
|
|
|
|
|
+对比分析 fitz (PyMuPDF) 和 pypdfium2 渲染 PDF 的差异
|
|
|
|
|
+用于诊断 UNet 表格识别结果不一致的问题
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import os
|
|
|
|
|
+import sys
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from PIL import Image
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+import cv2
|
|
|
|
|
+
|
|
|
|
|
# Image paths: the same page (page_003) saved once under tests/ (expected to
# be the fitz render) and once under the pipeline output tree (expected to be
# the pypdfium2 render).
_PARSER_ROOT = Path(
    "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser"
)
_PAGE_FILENAME = "2023年度报告母公司_page_003.png"

FITZ_IMAGE = _PARSER_ROOT / "tests" / _PAGE_FILENAME
PYPDFIUM2_IMAGE = (
    _PARSER_ROOT
    / "output"
    / "2023年度报告母公司"
    / "bank_statement_wired_unet"
    / "2023年度报告母公司"
    / _PAGE_FILENAME
)
|
|
|
|
|
+
|
|
|
|
|
def analyze_image(image_path: Path, label: str):
    """Print a property report for one image and return it in three forms.

    Args:
        image_path: Location of the image file to inspect.
        label: Human-readable name shown in the report header.

    Returns:
        A ``(pil_image, opencv_array, numpy_array)`` tuple. All three are
        ``None`` when the file does not exist; ``opencv_array`` alone is
        ``None`` when ``cv2.imread`` could not read the file.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"分析: {label}")
    print(rule)

    # Guard clause: nothing to analyse without a file.
    if not image_path.exists():
        print(f"❌ 文件不存在: {image_path}")
        return None, None, None

    # On-disk size, reported in KB.
    size_kb = image_path.stat().st_size / 1024
    print(f"📦 文件大小: {size_kb:.2f} KB")

    # Metadata as reported by Pillow.
    pil_image = Image.open(image_path)
    width, height = pil_image.size
    print(f"📐 图像尺寸: {width} × {height} (宽×高)")
    print(f"🎨 颜色模式: {pil_image.mode}")
    print(f"📊 格式: {pil_image.format}")

    # DPI is optional metadata; fall back to a placeholder when absent.
    dpi_info = pil_image.info.get('dpi', 'N/A')
    print(f"🔍 DPI 信息: {dpi_info}")

    # Pixel data as decoded by OpenCV.
    cv_image = cv2.imread(str(image_path))
    if cv_image is not None:
        rows, cols, channels = cv_image.shape
        print(f"📏 OpenCV 尺寸: {rows} × {cols} × {channels} (高×宽×通道)")
        print(f"📈 数据类型: {cv_image.dtype}")

        # Global intensity statistics over every channel.
        print(f"🔢 像素值范围: [{cv_image.min()}, {cv_image.max()}]")
        print(f"📊 平均值: {cv_image.mean():.2f}")
        print(f"📊 标准差: {cv_image.std():.2f}")

        # A pixel is pure black/white only when every channel is 0/255.
        pure_black = (cv_image == 0).all(axis=-1).sum()
        pure_white = (cv_image == 255).all(axis=-1).sum()
        pixel_count = rows * cols
        print(f"⚫ 纯黑像素: {pure_black} ({pure_black/pixel_count*100:.2f}%)")
        print(f"⚪ 纯白像素: {pure_white} ({pure_white/pixel_count*100:.2f}%)")

    # Raw array view of the Pillow image.
    np_image = np.array(pil_image)
    print(f"🧮 NumPy shape: {np_image.shape}")
    print(f"🧮 NumPy dtype: {np_image.dtype}")

    return pil_image, cv_image, np_image
|
|
|
|
|
+
|
|
|
|
|
def _save_diff_maps(diff_map, output_dir):
    """Write the three diagnostic difference images into *output_dir*.

    Args:
        diff_map: 2-D float array holding the per-pixel difference.
        output_dir: Existing directory (``Path``) that receives the PNG files.

    Returns:
        A list of ``(filename, description, ok)`` tuples, where ``ok`` is the
        boolean result returned by ``cv2.imwrite`` for that file.
    """
    peak = diff_map.max()
    # Stretch to the full 0-255 range so small differences stay visible;
    # skip the division when the images are identical (peak == 0).
    if peak > 0:
        stretched = (diff_map / peak * 255).astype(np.uint8)
    else:
        stretched = diff_map.astype(np.uint8)

    outputs = (
        ("diff_heatmap.png", "彩色热图", cv2.applyColorMap(stretched, cv2.COLORMAP_JET)),
        ("diff_raw.png", "原始差异", diff_map.astype(np.uint8)),
        ("diff_binary_5px.png", "差异>5的区域", (diff_map > 5).astype(np.uint8) * 255),
    )
    return [
        (filename, description, bool(cv2.imwrite(str(output_dir / filename), image)))
        for filename, description, image in outputs
    ]


def compare_images(img1_pil: Image.Image, img2_pil: Image.Image, label1: str, label2: str):
    """Print pixel-level difference statistics for two images and save diff maps.

    The report covers size agreement, global and per-channel difference
    statistics, the share of identical pixels, a cumulative tolerance
    distribution, the location of the largest difference and (when
    scikit-image is installed) the SSIM score. Three PNG diff maps are written
    to an ``analysis_output`` directory next to this script.

    Args:
        img1_pil: First image.
        img2_pil: Second image; must have the same size as ``img1_pil`` for a
            pixel-level comparison to take place.
        label1: Display name for the first image.
        label2: Display name for the second image.

    Returns:
        None. When the sizes differ the function reports the mismatch and
        returns early without writing any files.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"对比: {label1} vs {label2}")
    print(rule)

    # Size check: a pixel-wise comparison needs identical dimensions.
    if img1_pil.size != img2_pil.size:
        print(f"❌ 尺寸不一致:")
        print(f" {label1}: {img1_pil.size[0]} × {img1_pil.size[1]}")
        print(f" {label2}: {img2_pil.size[0]} × {img2_pil.size[1]}")
        print(f"\n⚠️ 尺寸不同,无法进行像素级对比")
        return
    print(f"✅ 尺寸一致: {img1_pil.size[0]} × {img1_pil.size[1]}")

    # Everything below (axis=-1 reductions, the R/G/B loop, RGB2GRAY) assumes
    # H×W×3 arrays. Bring both images to RGB so grayscale/palette/alpha inputs
    # neither crash nor silently reduce over the wrong axis.
    if img1_pil.mode != "RGB" or img2_pil.mode != "RGB":
        print(f"ℹ️ 颜色模式: {label1}={img1_pil.mode}, {label2}={img2_pil.mode} → 统一转换为 RGB 后对比")
        img1_pil = img1_pil.convert("RGB")
        img2_pil = img2_pil.convert("RGB")

    arr1 = np.array(img1_pil)
    arr2 = np.array(img2_pil)

    # Work in float so the subtraction cannot wrap around as uint8 would.
    diff = np.abs(arr1.astype(np.float32) - arr2.astype(np.float32))
    total_pixels = arr1.shape[0] * arr1.shape[1]

    print(f"\n📊 像素差异统计:")
    print(f" 最大差异: {diff.max():.2f} (0-255 范围)")
    print(f" 平均差异: {diff.mean():.2f}")
    print(f" 中位数差异: {np.median(diff):.2f}")
    print(f" 差异标准差: {diff.std():.2f}")

    # Largest channel difference per pixel: a pixel is "within t" only when
    # every one of its channels is within t.
    pixel_max = diff.max(axis=-1)

    identical_pixels = int(np.count_nonzero(pixel_max == 0))
    identical_ratio = identical_pixels / total_pixels * 100
    print(f"\n✓ 完全相同的像素: {identical_pixels:,} / {total_pixels:,} ({identical_ratio:.2f}%)")

    # Cumulative tolerance distribution.
    thresholds = (1, 5, 10)
    within = {t: int(np.count_nonzero(pixel_max <= t)) for t in thresholds}
    print(f"\n📈 差异分布 (有差异的像素):")
    for t in thresholds:
        print(f" ≤ {t} 灰度级: {within[t]:,} ({within[t]/total_pixels*100:.2f}%)")
    beyond = total_pixels - within[10]
    print(f" > 10 灰度级: {beyond:,} ({beyond/total_pixels*100:.2f}%)")

    # Per-channel statistics, reusing the already computed difference array.
    print(f"\n🎨 各颜色通道差异:")
    for i, channel in enumerate(['红色 (R)', '绿色 (G)', '蓝色 (B)']):
        channel_diff = diff[:, :, i]
        print(f" {channel}: 平均 {channel_diff.mean():.2f}, 最大 {channel_diff.max():.0f}")

    # Channel-averaged difference map; also the source of the saved images.
    diff_map = diff.mean(axis=-1)
    max_diff_loc = np.unravel_index(diff_map.argmax(), diff_map.shape)
    print(f"\n🔥 最大差异位置:")
    print(f" 坐标: (y={max_diff_loc[0]}, x={max_diff_loc[1]})")
    print(f" 差异值: {diff_map[max_diff_loc]:.2f}")
    print(f" {label1} 像素值: {arr1[max_diff_loc]}")
    print(f" {label2} 像素值: {arr2[max_diff_loc]}")

    # SSIM is optional: only the import is guarded, so real computation
    # errors are not mistaken for a missing dependency.
    try:
        from skimage.metrics import structural_similarity as ssim
    except ImportError:
        print(f"\n⚠️ 未安装 scikit-image,跳过 SSIM 计算")
        print(f" 安装: pip install scikit-image")
    else:
        gray1 = cv2.cvtColor(arr1, cv2.COLOR_RGB2GRAY)
        gray2 = cv2.cvtColor(arr2, cv2.COLOR_RGB2GRAY)
        ssim_value = ssim(gray1, gray2)
        print(f"\n📏 SSIM 结构相似性: {ssim_value:.6f}")
        print(f" (1.0 = 完全相同, >0.95 = 几乎相同, <0.9 = 有明显差异)")

    # Persist the diff maps next to this script.
    output_dir = Path(__file__).parent / "analysis_output"
    output_dir.mkdir(exist_ok=True)
    written = _save_diff_maps(diff_map, output_dir)

    print(f"\n💾 差异图已保存到: {output_dir}")
    for filename, description, ok in written:
        # cv2.imwrite signals failure through its return value, not an
        # exception, so report each file's real outcome.
        status = description if ok else "❌ 写入失败"
        print(f" - {filename} ({status})")
|
|
|
|
|
+
|
|
|
|
|
def analyze_rendering_differences():
    """Print a static explanation of why the two renderers produce different pixels.

    The text lists the suspected sources of difference between PyMuPDF and
    pypdfium2 output, their impact on UNet table recognition, and suggested
    mitigations. Nothing is computed; the function only writes to stdout.
    """
    rule = "=" * 70
    report = """
## 主要差异来源:

### 1. 抗锯齿算法 (Anti-aliasing)
 • PyMuPDF (fitz): 使用 MuPDF 渲染引擎,默认启用抗锯齿
 • pypdfium2: 使用 PDFium 渲染引擎(Chrome PDF 引擎)

 影响: 边缘平滑度不同,细线条的像素值会有 1-3 灰度级差异

### 2. 颜色空间处理
 • PyMuPDF: MuPDF 内部颜色管理
 • pypdfium2: Chromium 颜色管理系统

 影响: RGB 值可能有 1-2 个灰度级的系统性偏差

### 3. 字体渲染引擎
 • PyMuPDF: FreeType 字体渲染
 • pypdfium2: PDFium/Skia 字体渲染

 影响: 文字边缘、字形细节略有不同,影响 OCR 识别

### 4. DPI 缩放算法
 • PyMuPDF: fitz.Matrix() 矩阵变换
 • pypdfium2: bitmap.render(scale=) 缩放

 影响: 插值算法不同,导致边缘像素值差异

### 5. 尺寸限制策略
 • PyMuPDF: >4500px → 降为 72 DPI
 • pypdfium2: >3500px → 动态调整 scale

 影响: 大尺寸 PDF 可能产生不同分辨率的图像

## 对 UNet 表格识别的影响:

### 直接影响:
✗ 线条边缘抗锯齿差异 → UNet 检测线条位置有 1-2 像素偏移
✗ 文字清晰度差异 → 影响单元格文本区域识别
✗ 整体对比度差异 → 影响表格线检测阈值

### 建议解决方案:
1. 统一渲染引擎: 全部使用 pypdfium2 (更稳定、更快)
2. 保存调试图像: 保存 UNet 输入图像以便排查
3. 调整检测阈值: 考虑渲染差异,适当放宽容差
4. 使用相同测试数据: 确保 test 和 production 使用同一渲染方法
"""

    # Section header followed by the report body.
    print(f"\n{rule}")
    print("🔬 渲染差异根本原因分析")
    print(rule)

    print(report)
|
|
|
|
|
+
|
|
|
|
|
def main():
    """Run the three-step comparison and return a process exit code.

    Steps: analyse each image on its own, compare the two images, then print
    the static root-cause notes.

    Returns:
        0 on success; 1 when either input image is missing or could not be
        analysed.
    """
    rule = "=" * 70
    print(rule)
    print("PDF 渲染引擎对比分析工具")
    print("fitz (PyMuPDF) vs pypdfium2")
    print(rule)

    # Both inputs must exist before any analysis starts; each entry carries
    # the engine name and the hint shown when its image is missing.
    required_inputs = (
        (FITZ_IMAGE, "fitz", "请确保已使用 fitz 渲染 PDF 并保存图像"),
        (PYPDFIUM2_IMAGE, "pypdfium2", "请运行 pipeline 生成输出图像"),
    )
    for path, engine, hint in required_inputs:
        if not path.exists():
            print(f"\n❌ {engine} 图像不存在: {path}")
            print(f" {hint}")
            return 1

    # Step 1: per-image properties. Only the PIL image is needed afterwards.
    print("\n🔍 第一步: 分析各自的图像属性")
    fitz_pil = analyze_image(FITZ_IMAGE, "PyMuPDF (fitz)")[0]
    if fitz_pil is None:
        return 1

    pypdfium2_pil = analyze_image(PYPDFIUM2_IMAGE, "pypdfium2")[0]
    if pypdfium2_pil is None:
        return 1

    # Step 2: pixel-level comparison.
    print("\n📊 第二步: 对比两张图像的差异")
    compare_images(fitz_pil, pypdfium2_pil, "PyMuPDF", "pypdfium2")

    # Step 3: static root-cause notes.
    print("\n💡 第三步: 分析差异的根本原因")
    analyze_rendering_differences()

    print(f"\n{rule}")
    print("✅ 分析完成")
    print(rule)
    print(f"\n查看输出目录: {Path(__file__).parent / 'analysis_output'}")

    return 0
|
|
|
|
|
+
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|