Bläddra i källkod

fix: 优化UNet推理调试信息记录,增强尺寸一致性验证,返回检测到的倾斜角度

zhch158_admin 10 timmar sedan
förälder
incheckning
af1c467c48
1 ändrad fil med 34 tillägg och 90 borttagningar
  1. 34 90
      ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py

+ 34 - 90
ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py

@@ -193,6 +193,12 @@ class MinerUWiredTableRecognizer:
             upscale = self.upscale_ratio if self.upscale_ratio and self.upscale_ratio > 0 else 1.0
             h, w = table_image.shape[:2]
             
+            # 调试选项合并(需要在 run_unet 之前初始化,因为内部函数会引用)
+            dbg = self.debug_utils.merge_debug_options(self.config, debug_options or {})
+            debug_dir = None
+            if dbg and dbg.enabled and dbg.output_dir:
+                debug_dir = dbg.output_dir
+            
             # 定义内部函数以方便复用 UNet 推理
             def run_unet(img_in):
                 if upscale != 1.0:
@@ -231,16 +237,10 @@ class MinerUWiredTableRecognizer:
                         preprocessed_h_actual = preprocessed_h
                         preprocessed_w_actual = preprocessed_w
                     scale_diff = abs(w_scale_actual - h_scale_actual)
-                    logger.info(
-                        f"🔍 UNet预处理缩放因子验证: "
-                        f"w_scale={w_scale_actual:.6f}, h_scale={h_scale_actual:.6f}, "
-                        f"差异={scale_diff:.6f}, "
-                        f"预处理后实际尺寸=[{preprocessed_h_actual}, {preprocessed_w_actual}]"
-                    )
-                    if scale_diff > 1e-6:
+                    # 仅在差异过大时警告,用于生产环境排查
+                    if scale_diff / max(w_scale_actual, h_scale_actual) > 0.01:  # >1%差异
                         logger.warning(
-                            f"⚠️ w_scale 和 h_scale 不相等!这可能导致坐标偏移。"
-                            f"w_scale={w_scale_actual:.6f}, h_scale={h_scale_actual:.6f}"
+                            f"UNet预处理缩放因子差异>1%: w_scale={w_scale_actual:.6f}, h_scale={h_scale_actual:.6f}"
                         )
                 except Exception as e:
                     logger.warning(f"无法获取实际缩放因子: {e}")
@@ -255,18 +255,12 @@ class MinerUWiredTableRecognizer:
                 hpred_ = np.where(pred_ == 1, 255, 0).astype(np.uint8)
                 vpred_ = np.where(pred_ == 2, 255, 0).astype(np.uint8)
                 
-                # 调试:记录尺寸信息
                 pred_h, pred_w = pred_.shape[:2]
-                logger.info(
-                    f"🔍 UNet 推理详细日志:\n"
-                    f"  - 上采样图像尺寸: [{h_up_}, {w_up_}]\n"
-                    f"  - 计算预处理后尺寸: [{preprocessed_h}, {preprocessed_w}]\n"
-                    f"  - 实际预处理后尺寸: [{preprocessed_h_actual}, {preprocessed_w_actual}]\n"
-                    f"  - 预测结果尺寸: [{pred_h}, {pred_w}]\n"
-                    f"  - 计算缩放因子: {scale_factor:.6f}\n"
-                    f"  - 实际缩放因子: w_scale={w_scale_actual:.6f}, h_scale={h_scale_actual:.6f}\n"
-                    f"  - upscale: {upscale:.3f}"
-                )
+                # 在 debug 模式下记录关键尺寸信息
+                if debug_dir:
+                    logger.debug(
+                        f"UNet推理: 上采样[{h_up_}, {w_up_}], 预测[{pred_h}, {pred_w}], upscale={upscale:.3f}"
+                    )
                 
                 # 关键修复:正确地将预测结果 resize 回上采样尺寸
                 # UNet 的 postprocess 使用 ori_shape = img.shape 来 resize 预测结果
@@ -274,12 +268,10 @@ class MinerUWiredTableRecognizer:
                 # 所以我们应该使用 img_up_.shape 来 resize 预测结果
                 # 但是,由于预处理时改变了图像尺寸(保持长宽比),我们需要确保 resize 是正确的
                 
-                # 验证:检查预测结果尺寸是否与预处理后的尺寸一致(仅用于警告)
-                if pred_h != preprocessed_h_actual or pred_w != preprocessed_w_actual:
+                # 验证:预测结果尺寸应与预处理后尺寸一致
+                if debug_dir and (pred_h != preprocessed_h_actual or pred_w != preprocessed_w_actual):
                     logger.warning(
-                        f"⚠️ 预测结果尺寸 [{pred_h}, {pred_w}] 与预处理后实际尺寸 "
-                        f"[{preprocessed_h_actual}, {preprocessed_w_actual}] 不一致!"
-                        f"这可能导致坐标偏移。"
+                        f"预测尺寸[{pred_h}, {pred_w}] 与预处理尺寸[{preprocessed_h_actual}, {preprocessed_w_actual}]不一致"
                     )
                 
                 # 修复:统一将预测结果resize回上采样尺寸,避免舍入误差
@@ -289,61 +281,23 @@ class MinerUWiredTableRecognizer:
                 hpred_up_ = cv2.resize(hpred_, (w_up_, h_up_), interpolation=cv2.INTER_NEAREST)
                 vpred_up_ = cv2.resize(vpred_, (w_up_, h_up_), interpolation=cv2.INTER_NEAREST)
                 
-                # 记录验证信息:检查理论target尺寸与实际上采样尺寸的差异
-                # 这些差异应该非常小(<2像素),如果差异较大说明UNet预处理有问题
-                if abs(w_scale_actual - h_scale_actual) > 1e-6:
-                    target_w_theoretical = int(pred_w / w_scale_actual + 0.5)
-                    target_h_theoretical = int(pred_h / h_scale_actual + 0.5)
-                    diff_w = abs(target_w_theoretical - w_up_)
-                    diff_h = abs(target_h_theoretical - h_up_)
-                    if diff_w > 2 or diff_h > 2:
-                        logger.warning(
-                            f"⚠️ 理论resize尺寸 [{target_h_theoretical}, {target_w_theoretical}] "
-                            f"与上采样尺寸 [{h_up_}, {w_up_}] 差异较大 (diff=[{diff_h}, {diff_w}])!"
-                            f"w_scale={w_scale_actual:.6f}, h_scale={h_scale_actual:.6f}"
-                        )
-                    else:
-                        logger.debug(
-                            f"✓ 理论resize尺寸 [{target_h_theoretical}, {target_w_theoretical}] "
-                            f"与上采样尺寸 [{h_up_}, {w_up_}] 一致 (diff=[{diff_h}, {diff_w}])"
-                        )
+                # 在 debug 模式下验证 resize 一致性
+                if debug_dir:
+                    hpred_up_h, hpred_up_w = hpred_up_.shape[:2]
+                    vpred_up_h, vpred_up_w = vpred_up_.shape[:2]
+                    if hpred_up_h != h_up_ or hpred_up_w != w_up_:
+                        logger.warning(f"Mask尺寸[{hpred_up_h}, {hpred_up_w}] 与上采样尺寸[{h_up_}, {w_up_}]不一致")
                 
-                # 记录resize后的mask尺寸
-                hpred_up_h, hpred_up_w = hpred_up_.shape[:2]
-                vpred_up_h, vpred_up_w = vpred_up_.shape[:2]
-                logger.info(
-                    f"🔍 Resize后mask尺寸: "
-                    f"hpred_up=[{hpred_up_h}, {hpred_up_w}], "
-                    f"vpred_up=[{vpred_up_h}, {vpred_up_w}], "
-                    f"img_up=[{h_up_}, {w_up_}]"
-                )
-                
-                # 详细的坐标转换链路日志
-                logger.info(
-                    f"🔍 UNet推理完成 - 坐标转换链路验证:\n"
-                    f"  [1] 原图尺寸: [{h}, {w}]\n"
-                    f"  [2] 上采样尺寸: [{h_up_}, {w_up_}] (upscale={upscale:.3f})\n"
-                    f"  [3] UNet输入尺寸: [{pred_h}, {pred_w}] (h_scale={h_scale_actual:.6f}, w_scale={w_scale_actual:.6f})\n"
-                    f"  [4] Mask尺寸: [{hpred_up_h}, {hpred_up_w}] (已resize回上采样尺寸)\n"
-                    f"  验证: 理论upscale = {h_up_ / h:.3f} (h), {w_up_ / w:.3f} (w)"
-                )
-                
-                # 验证mask尺寸是否与上采样图像一致
-                if hpred_up_h != h_up_ or hpred_up_w != w_up_:
-                    logger.error(
-                        f"❌ hpred_up 尺寸 [{hpred_up_h}, {hpred_up_w}] 与上采样图像尺寸 "
-                        f"[{h_up_}, {w_up_}] 不一致!"
-                    )
-                if vpred_up_h != h_up_ or vpred_up_w != w_up_:
-                    logger.error(
-                        f"❌ vpred_up 尺寸 [{vpred_up_h}, {vpred_up_w}] 与上采样图像尺寸 "
-                        f"[{h_up_}, {w_up_}] 不一致!"
-                    )
+                # 最终验证:确保 mask 尺寸正确
+                if hpred_up_.shape[:2] != (h_up_, w_up_):
+                    logger.error(f"hpred_up 尺寸 {hpred_up_.shape[:2]} 与上采样图像尺寸 ({h_up_}, {w_up_}) 不一致")
+                if vpred_up_.shape[:2] != (h_up_, w_up_):
+                    logger.error(f"vpred_up 尺寸 {vpred_up_.shape[:2]} 与上采样图像尺寸 ({h_up_}, {w_up_}) 不一致")
                 
-                return hpred_up_, vpred_up_, img_up_, w_scale_actual, h_scale_actual
+                return hpred_up_, vpred_up_, img_up_
 
             # Step 1: 首次运行 UNet 获取初步 mask
-            hpred_up, vpred_up, img_up, w_scale_actual, h_scale_actual = run_unet(table_image)
+            hpred_up, vpred_up, img_up = run_unet(table_image)
             
             # Step 1.1: 基于 Mask 的高精度倾斜检测与矫正
             if self.skew_detector.enable_deskew:
@@ -360,13 +314,10 @@ class MinerUWiredTableRecognizer:
                     h, w = table_image.shape[:2]
                     
                     # 重新运行 UNet (确保 mask 与矫正后的图完全对齐)
-                    hpred_up, vpred_up, img_up, w_scale_actual, h_scale_actual = run_unet(table_image)
+                    hpred_up, vpred_up, img_up = run_unet(table_image)
                 else:
                     logger.debug(f"表格倾斜 {skew_angle:.3f}° 小于阈值,无需矫正")
                         
-            # 调试选项合并
-            dbg = self.debug_utils.merge_debug_options(self.config, debug_options or {})
-
             # Step 1.5: 可视化表格线(调试用)- 需要缩放回原图
             if self.debug_utils.debug_is_on("save_table_lines", dbg):
                 hpred_orig = cv2.resize(hpred_up, (w, h), interpolation=cv2.INTER_NEAREST)
@@ -381,22 +332,15 @@ class MinerUWiredTableRecognizer:
                     )
 
             # Step 2: 使用连通域法提取单元格 (替换了原来的投影法)
-            debug_dir = None
-            debug_prefix = ""
-            if dbg and dbg.enabled and dbg.output_dir:
-                debug_dir = dbg.output_dir
-                debug_prefix = f"{dbg.prefix}_grid" if dbg.prefix else "grid"
+            debug_prefix = f"{dbg.prefix}_grid" if dbg.prefix else "grid"
             
-            # 传入原图的实际尺寸和UNet预处理时的缩放因子,用于计算真实的缩放比例
-            # 这样可以正确处理 UNet 预处理改变图像尺寸的情况
+            # 传入原图的实际尺寸用于计算坐标缩放比例
             bboxes = self.grid_recovery.compute_cells_from_lines(
                 hpred_up, 
                 vpred_up, 
                 upscale,
                 orig_h=h,
                 orig_w=w,
-                unet_w_scale=w_scale_actual,
-                unet_h_scale=h_scale_actual,
                 debug_dir=debug_dir,
                 debug_prefix=debug_prefix
             )
@@ -497,6 +441,7 @@ class MinerUWiredTableRecognizer:
             return {
                 "html": html_filled,
                 "cells": cells,
+                "skew_angle": skew_angle,  # 返回检测到的角度
             }
         except Exception as e:
             logger.error(f"可视化网格结构时发生错误: {e}")
@@ -530,4 +475,3 @@ class MinerUWiredTableRecognizer:
                 return self.recognize_legacy(table_image, ocr_boxes)
         else:
             return self.recognize_legacy(table_image, ocr_boxes)
-