1 Minggu lalu · 672d58aaf3
--- a/comparator/table_comparator.py
+++ b/comparator/table_comparator.py
@@ -286,8 +286,8 @@ class TableComparator:
 
				         
			
 
				         检测策略:
			
 
				         1. 查找包含表头关键字最多的行
			
 
				-        2. 确认下一行是数据行
			
 
				-        3. 避免将合并单元格的元数据行误判为表头
			
 
				+        2. 确认下一行是数据行（或分类行）
			
 
				+        3. 特殊处理：资产负债表等多层表头
			
 
				         """
			
 
				         if not table:
			
 
				             return 0
			
@@ -299,13 +299,15 @@ class TableComparator:
 
				             '摘要', 'description', '说明', 'remark',
			
 
				             '金额', 'amount', '借方', 'debit', '贷方', 'credit',
			
 
				             '余额', 'balance',
			
 
				-            '对手', 'counterparty', '账户', 'account', '户名', 'name'
			
 
				+            '对手', 'counterparty', '账户', 'account', '户名', 'name',
			
 
				+            # ✅ 新增：资产负债表关键词
			
 
				+            # '资产', 'asset', '负债', 'liability', '期末', 'period', '期初'
			
 
				+            '期末', 'period', '期初'
			
 
				         ]
			
 
				         
			
 
				         best_header_row = 0
			
 
				         best_score = 0
			
 
				 
			
 
				-        # 如果表格行数小于10，取全部行进行检测，如果大于10，取前10行
			
 
				         for row_idx, row in enumerate(table[:10]):
			
 
				             if not row:
			
 
				                 continue
			
@@ -324,36 +326,65 @@ class TableComparator:
 
				                         if keyword in cell_lower:
			
 
				                             keyword_count += 1
			
 
				                             break
			
 
				-            
			
 
				-            # 避免空行或几乎空的行
			
 
				+        
			
 
				             if non_empty_cells < 3:
			
 
				                 continue
			
 
				             
			
 
				-            # 计算得分：关键字比例 + 列数奖励
			
 
				             keyword_ratio = keyword_count / non_empty_cells if non_empty_cells > 0 else 0
			
 
				-            column_bonus = min(non_empty_cells / 5, 1.0)  # 列数越多，奖励越高
			
 
				+            column_bonus = min(non_empty_cells / 5, 1.0)
			
 
				             score = keyword_ratio * 0.7 + column_bonus * 0.3
			
 
				             
			
 
				-            # 如果下一行是数据行，加分
			
 
				+            # ✅ 改进：跳过分类行和数据行检测
			
 
				             if row_idx + 1 < len(table):
			
 
				                 next_row = table[row_idx + 1]
			
 
				+                # 如果下一行是数据行，加分
			
 
				                 if self._is_data_row(next_row):
			
 
				                     score += 0.2
			
 
				-            
			
 
				+                # ✅ 新增：如果下一行是分类行（如"流动资产:"），小幅加分
			
 
				+                elif self._is_category_row(next_row):
			
 
				+                    score += 0.1
			
 
				+
			
 
				             if score > best_score:
			
 
				                 best_score = score
			
 
				                 best_header_row = row_idx
			
 
				         
			
 
				-        # 如果最佳得分太低，返回0（第一行）
			
 
				         if best_score < 0.3:
			
 
				             print(f"   ⚠️  未检测到明确表头，默认使用第1行 (得分: {best_score:.2f})")
			
 
				             return 0
			
 
				         
			
 
				         print(f"   📍 检测到表头在第 {best_header_row + 1} 行 (得分: {best_score:.2f})")
			
 
				         return best_header_row
			
 
				+
			
 
				+    def _is_category_row(self, row: List[str]) -> bool:
			
 
				+        """
			
 
				+        ✅ 新增：判断是否为分类行（如"流动资产:"）
			
 
				+        """
			
 
				+        if not row:
			
 
				+            return False
			
 
				+        
			
 
				+        category_patterns = [
			
 
				+            # r'流动[资产负债]',
			
 
				+            # r'非流动[资产负债]',
			
 
				+            r'.*:$',  # 以冒号结尾
			
 
				+        ]
			
 
				+        
			
 
				+        for cell in row:
			
 
				+            cell_text = str(cell).strip()
			
 
				+            if not cell_text:
			
 
				+                continue
			
 
				+            
			
 
				+            for pattern in category_patterns:
			
 
				+                if re.search(pattern, cell_text):
			
 
				+                    return True
			
 
				+        
			
 
				+        return False
			
 
				     
			
 
				     def _is_data_row(self, row: List[str]) -> bool:
			
 
				-        """判断是否为数据行"""
			
 
				+        """
			
 
				+        判断是否为数据行（改进版）
			
 
				+        
			
 
				+        ✅ "-" 符号表示金额为0或空，应该被认为是有效的数据单元格
			
 
				+        """
			
 
				         if not row:
			
 
				             return False
			
 
				         
			
@@ -367,6 +398,11 @@ class TableComparator:
 
				             
			
 
				             non_empty_count += 1
			
 
				             
			
 
				+            # ✅ "-" 符号也是有效的数据（表示0或空）
			
 
				+            if cell_text == '-' or cell_text == '—' or cell_text == '--':
			
 
				+                data_pattern_count += 1
			
 
				+                continue
			
 
				+            
			
 
				             # 包含数字
			
 
				             if re.search(r'\d', cell_text):
			
 
				                 data_pattern_count += 1