3 kuukautta sitten · 9f84fef765
--- a/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
@@ -28,7 +28,7 @@ class TextFiller:
 
				         """
			
 
				         self.ocr_engine = ocr_engine
			
 
				         self.cell_crop_margin: int = config.get("cell_crop_margin", 2)
			
 
				-        self.ocr_conf_threshold: float = config.get("ocr_conf_threshold", 0.9)  # 单元格 OCR 置信度阈值
			
 
				+        self.ocr_conf_threshold: float = config.get("ocr_conf_threshold", 0.9)  # 单元格 OCR 置信度阈值（基准值）
			
 
				         
			
 
				         # 跨单元格检测配置参数
			
 
				         self.overlap_threshold_horizontal: float = config.get("overlap_threshold_horizontal", 0.2)
			
@@ -38,6 +38,45 @@ class TextFiller:
 
				         self.other_cell_max_ratio: float = config.get("other_cell_max_ratio", 0.3)
			
 
				     
			
 
				     @staticmethod
			
 
    def calculate_dynamic_confidence_threshold(text: str, base_threshold: float = 0.9) -> float:
        """
        Compute an OCR confidence threshold that depends on the text length.

        Short strings are easy to misrecognize (e.g. "1" read as "l"), so they
        must clear a higher bar; long strings are more reliable as a whole, so
        the bar is lowered. The length is measured after stripping surrounding
        whitespace.

        Thresholds by stripped length (value for the default base of 0.9 in
        parentheses):

        - 1 char:       min(0.95, base + 0.10)   (0.95)
        - 2-3 chars:    min(0.92, base + 0.02)   (0.92)
        - 4-10 chars:   max(0.85, base - 0.05)   (0.85)
        - 11-20 chars:  max(0.80, base - 0.10)   (0.80)
        - 21+ chars:    max(0.75, base - 0.15)   (0.75)

        NOTE(review): the caps/floors are absolute constants tuned around the
        default base of 0.9. For a base far from 0.9 the result can land on the
        opposite side of the base (e.g. base 0.5 gives 0.85 for 4-10 chars,
        base 0.97 gives 0.95 for a single char) - confirm this is intended.

        Args:
            text: Recognized text. ``None``, empty, or whitespace-only text
                yields ``base_threshold`` unchanged.
            base_threshold: Baseline confidence threshold (default 0.9).

        Returns:
            The length-adjusted confidence threshold.
        """
        # Strip first so whitespace-only text is treated as empty instead of
        # falling into the "short text" branch and getting a raised threshold.
        stripped = text.strip() if text else ""
        if not stripped:
            return base_threshold

        text_len = len(stripped)

        if text_len == 1:
            # Single character: raise the bar, capped at 0.95.
            return min(0.95, base_threshold + 0.1)
        if text_len <= 3:
            # 2-3 characters: raise the bar slightly, capped at 0.92.
            return min(0.92, base_threshold + 0.02)
        if text_len <= 10:
            # 4-10 characters: lower the bar by 0.05, floored at 0.85.
            return max(0.85, base_threshold - 0.05)
        if text_len <= 20:
            # 11-20 characters: lower the bar by 0.10, floored at 0.80.
            return max(0.80, base_threshold - 0.1)
        # More than 20 characters: lower the bar by 0.15, floored at 0.75.
        return max(0.75, base_threshold - 0.15)
			
 
				+    
			
 
				+    @staticmethod
			
 
				     def calculate_overlap_ratio(ocr_bbox: List[float], cell_bbox: List[float]) -> float:
			
 
				         """
			
 
				         计算 OCR box 与单元格的重叠比例（重叠面积 / OCR box 面积）
			
@@ -608,7 +647,7 @@ class TextFiller:
 
				 
			
 
				             # 对齐长度，避免越界
			
 
				             n = min(len(results) if isinstance(results, list) else 0, len(crop_list), len(crop_indices))
			
 
				-            conf_th = self.ocr_conf_threshold
			
 
				+            base_conf_th = self.ocr_conf_threshold
			
 
				 
			
 
				             # 辅助函数：清理文件名中的非法字符
			
 
				             def sanitize_filename(text: str, max_length: int = 50) -> str:
			
@@ -642,10 +681,17 @@ class TextFiller:
 
				                     except Exception as e:
			
 
				                         logger.warning(f"保存单元格OCR图片失败 (cell {cell_idx}): {e}")
			
 
				                 
			
 
				-                if text_k and score_k >= conf_th:
			
 
				-                    texts[cell_idx] = text_k
			
 
				-                elif text_k:
			
 
				-                    logger.debug(f"单元格 {cell_idx} 二次OCR结果置信度({score_k:.2f})低于阈值({conf_th}): (文本: '{text_k[:30]}...')")
			
 
				+                if text_k:
			
 
				+                    # 根据文本长度动态调整置信度阈值
			
 
				+                    dynamic_conf_th = self.calculate_dynamic_confidence_threshold(text_k, base_conf_th)
			
 
				+                    
			
 
				+                    if score_k >= dynamic_conf_th:
			
 
				+                        texts[cell_idx] = text_k
			
 
				+                    else:
			
 
				+                        logger.debug(
			
 
				+                            f"单元格 {cell_idx} 二次OCR结果置信度({score_k:.2f})低于动态阈值({dynamic_conf_th:.2f}) "
			
 
				+                            f"[文本长度={len(text_k)}, 基准阈值={base_conf_th:.2f}]: '{text_k[:30]}...'"
			
 
				+                        )
			
 
				 
			
 
				         except Exception as e:
			
 
				             logger.warning(f"二次OCR失败: {e}")