SHA1
--- a/.cursor/rules/project-conventions.mdc
+++ b/.cursor/rules/project-conventions.mdc
@@ -0,0 +1,35 @@
 
				+---
			
 
				+description: 项目通用约定（中文回复、conda 环境）
			
 
				+alwaysApply: true
			
 
				+---
			
 
				+
			
 
				+# 项目约定
			
 
				+
			
 
				+## 语言
			
 
				+
			
 
				+- 与用户的所有对话、说明、总结、PR/提交说明草稿均使用**简体中文**。
			
 
				+- 代码标识符、路径、命令、日志原文可保持英文；技术术语首次出现时可附简短中文说明。
			
 
				+
			
 
				+## Python / Shell 环境
			
 
				+
			
 
				+本项目使用 Conda 环境 **`mineru`**。执行 Python、pip、pytest 或其他需要项目依赖的命令前，必须先激活该环境。
			
 
				+
			
 
				+### 交互式终端
			
 
				+
			
 
				+```bash
			
 
				+conda activate mineru
			
 
				+```
			
 
				+
			
 
				+### 非交互式命令（推荐，避免 activate 未生效）
			
 
				+
			
 
				+```bash
			
 
				+conda run -n mineru python ...
			
 
				+conda run -n mineru pip ...
			
 
				+conda run -n mineru pytest ...
			
 
				+```
			
 
				+
			
 
				+### 要求
			
 
				+
			
 
				+- 不要假设系统默认 `python3` 已安装项目依赖（如 `bs4`、`torch` 等）。
			
 
				+- 若命令失败且提示缺少模块，先确认是否在 `mineru` 环境中再排查代码问题。
			
 
				+- 新建脚本或文档中的示例命令，应体现 `conda activate mineru` 或 `conda run -n mineru`。
			
--- a/.cursorrules
+++ b/.cursorrules
@@ -0,0 +1,4 @@
 
				+# Project Rules / 项目规则
			
 
				+
			
 
				+1. 所有回答必须使用中文。
			
 
				+2. 执行开发与运行命令前，须先通过 `conda activate mineru` 激活 `mineru` 环境。
			
--- a/ocr_comparator/compare_ocr_results.py
+++ b/ocr_comparator/compare_ocr_results.py
@@ -90,9 +90,9 @@ if __name__ == "__main__":
 
				         # 测试流水表格对比
			
 
				         import time
			
 
				         result = compare_ocr_results(
			
 
				-            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v4/B用户_扫描流水_page_001.md',
			
 
				-            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v3/B用户_扫描流水_page_001.md',
			
 
				-            output_file=f'/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
			
 
				+            file1_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_local/钟_广东陆丰农村商业银行_page_001.md',
			
 
				+            file2_path='/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/bank_statement_yusys_glmocr_local/钟_广东陆丰农村商业银行_page_001.md',
			
 
				+            output_file=f'/Users/zhch158/workspace/data/流水分析/钟_广东陆丰农村商业银行/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
			
 
				             output_format='both',
			
 
				             ignore_images=True,
			
 
				             table_mode='flow_list',  # 使用流水表格模式
			
--- a/ocr_comparator/content_extractor.py
+++ b/ocr_comparator/content_extractor.py
@@ -24,6 +24,10 @@ class ContentExtractor:
 
				         text = re.sub(r'\s*([，。：；！？、])\s*', r'\1', text)
			
 
				         return text
			
 
				     
			
 
				+    def _strip_html_comments(self, content: str) -> str:
			
 
				+        """移除 HTML/Markdown 注释块（含多行），不参与段落提取与对比。"""
			
 
				+        return re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
			
 
				+
			
 
				     def _is_image_reference(self, text: str) -> bool:
			
 
				         """判断是否为图片引用或描述"""
			
 
				         image_keywords = [
			
@@ -194,23 +198,19 @@ class ContentExtractor:
 
				         return merged_lines
			
 
				     
			
 
				     def extract_paragraphs(self, content: str) -> List[str]:
			
 
				-        """提取段落内容"""
			
 
				-        # 移除HTML标签
			
 
				-        content_no_html = re.sub(r'<[^>]+>', '', content)
			
 
				-        
			
 
				-        # 移除bbox注释
			
 
				-        content_no_bbox = re.sub(r'<!--.*?-->', '', content_no_html)
			
 
				-        
			
 
				-        # 按换行符分割
			
 
				+        """提取段落内容（HTML 注释、标准化说明元数据不参与对比）"""
			
 
				+        # 必须先去掉注释：多行 <!-- ... --> 无法用 <[^>]+> 或单行 .*? 一次清干净
			
 
				+        content_no_comments = self._strip_html_comments(content)
			
 
				+        content_no_html = re.sub(r'<[^>]+>', '', content_no_comments)
			
 
				+
			
 
				         paragraphs = []
			
 
				-        lines = content_no_bbox.split('\n')
			
 
				+        lines = content_no_html.split('\n')
			
 
				         merged_lines = self.merge_split_paragraphs(lines)
			
 
				-        
			
 
				+
			
 
				         for line in merged_lines:
			
 
				             normalized = self._normalize_text(line)
			
 
				-            if normalized:
			
 
				-                paragraphs.append(normalized)
			
 
				-            else:
			
 
				-                print(f"跳过的内容无效或图片段落: {line[0:30] if line else ''}...")        
			
 
				+            if not normalized:
			
 
				+                continue
			
 
				+            paragraphs.append(normalized)
			
 
				 
			
 
				         return paragraphs
Autor	SHA1 Mensaxe	Data
zhch158_admin	01c5c02e94 feat(新增项目规则文件): 创建项目规则文件，明确所有回答需使用中文及开发运行时需激活`mineru`环境的要求，提升项目规范性与可维护性。	hai 1 mes
zhch158_admin	937aab7790 fix(更新OCR结果对比路径与内容提取逻辑): 修改compare_ocr_results.py中的文件路径以适应新的数据源，同时在content_extractor.py中新增HTML注释移除功能，优化段落提取逻辑，提升OCR处理的准确性与灵活性。	hai 1 mes
zhch158_admin	55ca99c249 feat(新增项目通用约定文档): 创建项目通用约定文档，明确使用简体中文进行交流及命令执行时需激活`mineru`环境的要求，提升项目规范性与可维护性。	hai 1 mes