Selaa lähdekoodia

fix(更新OCR结果对比路径与内容提取逻辑): 修改compare_ocr_results.py中的文件路径以适应新的数据源,同时在content_extractor.py中新增HTML注释移除功能,优化段落提取逻辑,提升OCR处理的准确性与灵活性。

zhch158_admin 2 päivää sitten
vanhempi
commit
937aab7790
2 muutettua tiedostoa jossa 17 lisäystä ja 17 poistoa
  1. 3 3
      ocr_comparator/compare_ocr_results.py
  2. 14 14
      ocr_comparator/content_extractor.py

+ 3 - 3
ocr_comparator/compare_ocr_results.py

@@ -90,9 +90,9 @@ if __name__ == "__main__":
         # 测试流水表格对比
         import time
         result = compare_ocr_results(
-            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v4/B用户_扫描流水_page_001.md',
-            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v3/B用户_扫描流水_page_001.md',
-            output_file=f'/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
+            file1_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_local/钟_广东陆丰农村商业银行_page_001.md',
+            file2_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_glmocr_local/钟_广东陆丰农村商业银行_page_001.md',
+            output_file=f'/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
             output_format='both',
             ignore_images=True,
             table_mode='flow_list',  # 使用流水表格模式

+ 14 - 14
ocr_comparator/content_extractor.py

@@ -24,6 +24,10 @@ class ContentExtractor:
         text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
         return text
     
+    def _strip_html_comments(self, content: str) -> str:
+        """移除 HTML/Markdown 注释块(含多行),不参与段落提取与对比。"""
+        return re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+
     def _is_image_reference(self, text: str) -> bool:
         """判断是否为图片引用或描述"""
         image_keywords = [
@@ -194,23 +198,19 @@ class ContentExtractor:
         return merged_lines
     
     def extract_paragraphs(self, content: str) -> List[str]:
-        """提取段落内容"""
-        # 移除HTML标签
-        content_no_html = re.sub(r'<[^>]+>', '', content)
-        
-        # 移除bbox注释
-        content_no_bbox = re.sub(r'<!--.*?-->', '', content_no_html)
-        
-        # 按换行符分割
+        """提取段落内容(HTML 注释、标准化说明元数据不参与对比)"""
+        # 必须先去掉注释:多行 <!-- ... --> 无法用 <[^>]+> 或单行 .*? 一次清干净
+        content_no_comments = self._strip_html_comments(content)
+        content_no_html = re.sub(r'<[^>]+>', '', content_no_comments)
+
         paragraphs = []
-        lines = content_no_bbox.split('\n')
+        lines = content_no_html.split('\n')
         merged_lines = self.merge_split_paragraphs(lines)
-        
+
         for line in merged_lines:
             normalized = self._normalize_text(line)
-            if normalized:
-                paragraphs.append(normalized)
-            else:
-                print(f"跳过的内容无效或图片段落: {line[0:30] if line else ''}...")        
+            if not normalized:
+                continue
+            paragraphs.append(normalized)
 
         return paragraphs