1 月之前 · 937aab7790
--- a/ocr_comparator/compare_ocr_results.py
+++ b/ocr_comparator/compare_ocr_results.py
@@ -90,9 +90,9 @@ if __name__ == "__main__":
 
				         # 测试流水表格对比
			
 
				         import time
			
 
				         result = compare_ocr_results(
			
 
				-            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v4/B用户_扫描流水_page_001.md',
			
 
				-            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v3/B用户_扫描流水_page_001.md',
			
 
				-            output_file=f'/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
			
 
				+            file1_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_local/钟_广东陆丰农村商业银行_page_001.md',
			
 
				+            file2_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_glmocr_local/钟_广东陆丰农村商业银行_page_001.md',
			
 
				+            output_file=f'/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
			
 
				             output_format='both',
			
 
				             ignore_images=True,
			
 
				             table_mode='flow_list',  # 使用流水表格模式
			
--- a/ocr_comparator/content_extractor.py
+++ b/ocr_comparator/content_extractor.py
@@ -24,6 +24,10 @@ class ContentExtractor:
 
				         text = re.sub(r'\s*([，。：；！？、])\s*', r'\1', text)
			
 
				         return text
			
 
				     
			
 
				+    def _strip_html_comments(self, content: str) -> str:
			
 
				+        """移除 HTML/Markdown 注释块（含多行），不参与段落提取与对比。"""
			
 
				+        return re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
			
 
				+
			
 
				     def _is_image_reference(self, text: str) -> bool:
			
 
				         """判断是否为图片引用或描述"""
			
 
				         image_keywords = [
			
@@ -194,23 +198,19 @@ class ContentExtractor:
 
				         return merged_lines
			
 
				     
			
 
				     def extract_paragraphs(self, content: str) -> List[str]:
			
 
				-        """提取段落内容"""
			
 
				-        # 移除HTML标签
			
 
				-        content_no_html = re.sub(r'<[^>]+>', '', content)
			
 
				-        
			
 
				-        # 移除bbox注释
			
 
				-        content_no_bbox = re.sub(r'<!--.*?-->', '', content_no_html)
			
 
				-        
			
 
				-        # 按换行符分割
			
 
				+        """提取段落内容（HTML 注释、标准化说明元数据不参与对比）"""
			
 
				+        # 必须先去掉注释：多行 <!-- ... --> 无法用 <[^>]+> 或单行 .*? 一次清干净
			
 
				+        content_no_comments = self._strip_html_comments(content)
			
 
				+        content_no_html = re.sub(r'<[^>]+>', '', content_no_comments)
			
 
				+
			
 
				         paragraphs = []
			
 
				-        lines = content_no_bbox.split('\n')
			
 
				+        lines = content_no_html.split('\n')
			
 
				         merged_lines = self.merge_split_paragraphs(lines)
			
 
				-        
			
 
				+
			
 
				         for line in merged_lines:
			
 
				             normalized = self._normalize_text(line)
			
 
				-            if normalized:
			
 
				-                paragraphs.append(normalized)
			
 
				-            else:
			
 
				-                print(f"跳过的内容无效或图片段落: {line[0:30] if line else ''}...")        
			
 
				+            if not normalized:
			
 
				+                continue
			
 
				+            paragraphs.append(normalized)
			
 
				 
			
 
				         return paragraphs