11 Commits 0e20f6612e ... abf5932769

Auteur SHA1 Message Date
  zhch158_admin abf5932769 fix(markdown_generator): 移除印章置信度信息以简化输出格式 il y a 1 semaine
  zhch158_admin 4ede25dc86 feat(process_single_input): 添加场景名称设置功能并更新场景参数 il y a 1 semaine
  zhch158_admin 87c5b916fb feat(pipeline_manager): 添加场景名称设置功能,并同步到布局路由器 il y a 1 semaine
  zhch158_admin 3ab44b691b feat(layout_detection): 更新布局检测配置,支持智能路由器场景选择和多模型配置 il y a 1 semaine
  zhch158_admin 10872b84e9 feat(pdf_processing): 添加场景参数支持,优化 PDF 处理任务配置 il y a 1 semaine
  zhch158_admin 59f2fdd74c feat(glmocr_vl_adapter): 添加连通性测试以验证与 GLM-OCR API 的连接 il y a 1 semaine
  zhch158_admin 08cfd7cd25 fix(element_processors): 优化表格识别中的 VL 识别器异常处理逻辑 il y a 1 semaine
  zhch158_admin d2258858b5 feat(paddle_table_classifier): 优化表格线检测,使用自适应阈值和线段过滤 il y a 1 semaine
  zhch158_admin 73f20ff9e2 feat(mineru_wired_table): 添加 OCR 文本容差计算,优化网格结构恢复 il y a 1 semaine
  zhch158_admin 93977737f5 feat(grid_recovery): 增加 OCR 文本容差参数,优化网格结构恢复算法 il y a 1 semaine
  zhch158_admin ce673e8fc6 feat(cell_fusion): 增强单元格融合逻辑,支持UNet过度合并拆分,添加新配置参数 il y a 1 semaine

+ 77 - 26
ocr_tools/ocr_batch/batch_process_pdf.py

@@ -14,7 +14,7 @@ import json
 import yaml
 from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Tuple
 from dataclasses import dataclass, field
 import logging
 from tqdm import tqdm
@@ -34,11 +34,19 @@ class ProcessorConfig:
     extra_args: List[str] = field(default_factory=list)
     output_subdir: str = "results"
     log_subdir: str = "logs"  # 🎯 新增:日志子目录
+    scene_arg: Optional[str] = None  # 场景参数名(如 --scene)
     venv: Optional[str] = None
     description: str = ""
 
 
 @dataclass
+class PDFTask:
+    """PDF 处理任务"""
+    path: Path
+    scene: Optional[str] = None
+
+
+@dataclass
 class ProcessResult:
     """处理结果"""
     pdf_file: str
@@ -165,6 +173,7 @@ class ConfigManager:
             extra_args=proc_config.get('extra_args', []),
             output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
             log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'),  # 🎯 新增
+            scene_arg=proc_config.get('scene_arg'),
             venv=proc_config.get('venv'),
             description=proc_config.get('description', '')
         )
@@ -188,9 +197,9 @@ class PDFFileFinder:
     def __init__(self, base_dir: str):
         self.base_dir = Path(base_dir)
     
-    def from_file_list(self, list_file: str) -> List[Path]:
+    def from_file_list(self, list_file: str) -> List[PDFTask]:
         """从文件列表读取"""
-        pdf_files = []
+        pdf_files: List[PDFTask] = []
         
         with open(list_file, 'r', encoding='utf-8') as f:
             for line in f:
@@ -199,27 +208,39 @@ class PDFFileFinder:
                 if not line or line.startswith('#'):
                     continue
                 
+                file_part, scene = self._parse_list_line(line)
                 # 构建完整路径
-                pdf_path = self._resolve_path(line)
+                pdf_path = self._resolve_path(file_part)
                 if pdf_path:
-                    pdf_files.append(pdf_path)
+                    pdf_files.append(PDFTask(path=pdf_path, scene=scene))
         
         return pdf_files
     
-    def from_list(self, pdf_list: List[str]) -> List[Path]:
+    def from_list(self, pdf_list: List[str]) -> List[PDFTask]:
         """从列表读取"""
-        pdf_files = []
+        pdf_files: List[PDFTask] = []
         
         for pdf in pdf_list:
-            pdf_path = self._resolve_path(pdf.strip())
+            file_part, scene = self._parse_list_line(pdf.strip())
+            pdf_path = self._resolve_path(file_part)
             if pdf_path:
-                pdf_files.append(pdf_path)
+                pdf_files.append(PDFTask(path=pdf_path, scene=scene))
         
         return pdf_files
     
-    def find_all(self) -> List[Path]:
+    def find_all(self) -> List[PDFTask]:
         """查找基础目录下所有 PDF"""
-        return sorted(self.base_dir.rglob('*.pdf'))
+        return [PDFTask(path=path) for path in sorted(self.base_dir.rglob('*.pdf'))]
+
+    def _parse_list_line(self, line: str) -> Tuple[str, Optional[str]]:
+        """解析列表行(支持  文件<TAB>场景  或  文件,场景)"""
+        for sep in ["\t", ","]:
+            if sep in line:
+                file_part, scene_part = line.split(sep, 1)
+                file_part = file_part.strip()
+                scene_part = scene_part.strip()
+                return file_part, scene_part or None
+        return line.strip(), None
     
     def _resolve_path(self, path_str: str) -> Optional[Path]:
         """解析路径"""
@@ -263,13 +284,15 @@ class PDFBatchProcessor:
         processor_config: ProcessorConfig,
         output_subdir: Optional[str] = None,
         log_base_dir: Optional[str] = None,  # 🎯 新增:日志基础目录
-        dry_run: bool = False
+        dry_run: bool = False,
+        default_scene: Optional[str] = None
     ):
         self.processor_config = processor_config
         # 如果指定了output_subdir,使用指定的;否则使用处理器配置中的
         self.output_subdir = output_subdir or processor_config.output_subdir
         self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs')  # 🎯 新增
         self.dry_run = dry_run
+        self.default_scene = default_scene
         
         # 设置日志
         self.logger = self._setup_logger()
@@ -320,7 +343,7 @@ class PDFBatchProcessor:
         
         return log_file
     
-    def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
+    def process_files(self, pdf_files: List[PDFTask]) -> Dict[str, Any]:
         """批量处理文件"""
         self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
         self.logger.info(f"处理器: {self.processor_config.description}")
@@ -335,8 +358,8 @@ class PDFBatchProcessor:
         
         # 使用进度条
         with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
-            for pdf_file in pdf_files:
-                result = self._process_single_file(pdf_file)
+            for task in pdf_files:
+                result = self._process_single_file(task)
                 self.results.append(result)
                 pbar.update(1)
                 
@@ -355,9 +378,12 @@ class PDFBatchProcessor:
         
         return stats
     
-    def _process_single_file(self, pdf_file: Path) -> ProcessResult:
+    def _process_single_file(self, task: PDFTask) -> ProcessResult:
         """🎯 处理单个文件(支持日志重定向)"""
-        self.logger.info(f"处理: {pdf_file}")
+        pdf_file = task.path
+        scene = task.scene or self.default_scene
+        scene_info = f" (scene: {scene})" if scene else ""
+        self.logger.info(f"处理: {pdf_file}{scene_info}")
         
         # 检查文件是否存在
         if not pdf_file.exists():
@@ -376,7 +402,7 @@ class PDFBatchProcessor:
         log_file = self._get_log_file_path(pdf_file)
         
         # 构建命令
-        cmd = self._build_command(pdf_file, output_dir)
+        cmd = self._build_command(pdf_file, output_dir, scene)
         
         self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
         self.logger.info(f"日志输出: {log_file}")
@@ -398,7 +424,7 @@ class PDFBatchProcessor:
                 # 写入日志头
                 log_f.write(f"{'='*80}\n")
                 log_f.write(f"处理器: {self.processor_config.description}\n")
-                log_f.write(f"PDF 文件: {pdf_file}\n")
+                log_f.write(f"PDF 文件: {pdf_file}{scene_info}\n")
                 log_f.write(f"输出目录: {output_dir}\n")
                 log_f.write(f"开始时间: {datetime.now()}\n")
                 log_f.write(f"{'='*80}\n\n")
@@ -486,7 +512,7 @@ class PDFBatchProcessor:
                 log_file=str(log_file)
             )
     
-    def _build_command(self, pdf_file: Path, output_dir: Path):
+    def _build_command(self, pdf_file: Path, output_dir: Path, scene: Optional[str]):
         """构建执行命令
         
         Returns:
@@ -503,6 +529,13 @@ class PDFBatchProcessor:
         
         # 添加额外参数
         base_cmd.extend(self.processor_config.extra_args)
+
+        # 添加场景参数(如果配置了scene_arg)
+        if scene:
+            if self.processor_config.scene_arg:
+                base_cmd.extend([self.processor_config.scene_arg, scene])
+            else:
+                self.logger.warning("⚠️ 场景已提供但未配置scene_arg,已忽略场景参数")
         
         # 如果配置了虚拟环境,构建 shell 命令
         if self.processor_config.venv:
@@ -690,6 +723,17 @@ def create_parser() -> argparse.ArgumentParser:
         nargs='+',
         help='PDF 文件列表 (空格分隔)'
     )
+
+    # 场景参数
+    parser.add_argument(
+        '--scene',
+        help='默认场景名称(文件列表未提供场景时使用)'
+    )
+    parser.add_argument(
+        '--scene-arg',
+        default='--scene',
+        help='场景参数名称 (默认: --scene)'
+    )
     
     # 额外参数
     parser.add_argument(
@@ -772,8 +816,12 @@ def main():
             script=args.script,
             extra_args=args.extra_args.split() if args.extra_args else [],
             output_subdir=args.output_subdir or 'manual_results',
+            scene_arg=args.scene_arg,
             venv=args.venv
         )
+        # 如果用户指定了scene但手动配置未提供scene_arg,提示该场景参数将被忽略
+        if args.scene and not processor_config.scene_arg:
+            print("⚠️  已指定场景但未配置scene_arg,忽略场景参数")
     else:
         parser.error("必须指定 -p 或 -s 参数")
     
@@ -806,18 +854,20 @@ def main():
         return 1
     
     # 显示找到的文件
-    valid_file_paths = [f.as_posix() for f in pdf_files if f.exists()]
+    valid_file_paths = [f"{t.path.as_posix()}\t{t.scene}" if t.scene else t.path.as_posix()
+                        for t in pdf_files if t.path.exists()]
     if valid_file_paths:
         print("\n".join(valid_file_paths))    
 
     # 验证文件
-    valid_files = [f for f in pdf_files if f.exists()]
-    invalid_files = [f for f in pdf_files if not f.exists()]
+    valid_files = [t for t in pdf_files if t.path.exists()]
+    invalid_files = [t for t in pdf_files if not t.path.exists()]
     
     if invalid_files:
         print(f"\n⚠️  警告: {len(invalid_files)} 个文件不存在:")
-        for f in invalid_files[:5]:
-            print(f"  - {f}")
+        for t in invalid_files[:5]:
+            scene_suffix = f" (scene: {t.scene})" if t.scene else ""
+            print(f"  - {t.path}{scene_suffix}")
         if len(invalid_files) > 5:
             print(f"  ... 还有 {len(invalid_files) - 5} 个")
     
@@ -834,7 +884,8 @@ def main():
         processor_config=processor_config,
         output_subdir=args.output_subdir,
         log_base_dir=log_base_dir,  # 🎯 传递日志目录
-        dry_run=args.dry_run
+        dry_run=args.dry_run,
+        default_scene=args.scene
     )
     
     stats = processor.process_files(valid_files)

+ 17 - 16
ocr_tools/ocr_batch/pdf_list.txt

@@ -1,17 +1,18 @@
-德_内蒙古银行照.pdf
-对公_招商银行图.pdf
-A用户_单元格扫描流水.pdf
-B用户_扫描流水.pdf
-康强_北京农村商业银行.pdf
-施博深.pdf
-山西云集科技有限公司.pdf
-2023年度报告母公司.pdf
-提取自赤峰黄金2023年报.pdf
-许_民生银行图.pdf
-方_广发银行图.pdf
-付_工商银行943825图.pdf
-乔_建设银行图.pdf
-湛_平安银行图.pdf
-张_微信图.pdf
-朱_中信银行图.pdf
+# 格式:文件名<TAB>场景 或 文件名,场景(场景取值:bank_statement / financial_report)
+德_内蒙古银行照.pdf,bank_statement
+对公_招商银行图.pdf,bank_statement
+A用户_单元格扫描流水.pdf,bank_statement
+B用户_扫描流水.pdf,bank_statement
+康强_北京农村商业银行.pdf,bank_statement
+施博深.pdf,bank_statement
+山西云集科技有限公司.pdf,bank_statement
+2023年度报告母公司.pdf,financial_report
+提取自赤峰黄金2023年报.pdf,financial_report
+许_民生银行图.pdf,bank_statement
+方_广发银行图.pdf,bank_statement
+付_工商银行943825图.pdf,bank_statement
+乔_建设银行图.pdf,bank_statement
+湛_平安银行图.pdf,bank_statement
+张_微信图.pdf,bank_statement
+朱_中信银行图.pdf,bank_statement
 

+ 3 - 0
ocr_tools/ocr_batch/processor_configs.yaml

@@ -12,6 +12,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml"
       - "--pages=1-35"
@@ -27,6 +28,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v3.yaml"
       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"
@@ -45,6 +47,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl.yaml"
       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"

+ 30 - 8
ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml

@@ -18,16 +18,38 @@ preprocessor:
     enabled: false
 
 # ============================================================
-# Layout 检测配置 - 使用 PP-DocLayoutV3
+# Layout 检测配置 - 智能路由器(按场景直接选择模型)
 # ============================================================
 layout_detection:
-  module: "paddle"
-  model_name: "PP-DocLayoutV3"
-  model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
-  device: "cpu"
-  conf: 0.3
-  num_threads: 4
-  batch_size: 1
+  module: "smart_router"
+  strategy: "scene"  # 按场景直接选择模型,不走ocr_eval
+
+  # 场景策略:指定场景直接选用的布局模型
+  scene_strategy:
+    bank_statement:
+      model: "docling"
+    financial_report:
+      model: "paddle_ppdoclayoutv3"
+  default_model: "docling"
+
+  # 配置多个模型
+  models:
+    docling:
+      module: "docling"
+      model_name: "docling-layout-old"
+      model_dir: "ds4sd/docling-layout-old"
+      device: "cpu"
+      conf: 0.3
+      num_threads: 4
+
+    paddle_ppdoclayoutv3:
+      module: "paddle"
+      model_name: "PP-DocLayoutV3"
+      model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
+      device: "cpu"
+      conf: 0.3
+      num_threads: 4
+      batch_size: 1
   
   # 后处理配置
   post_process:

+ 6 - 6
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -495,13 +495,13 @@ class ElementProcessors:
         
         # VLM 识别获取表格结构HTML(懒加载)
         table_html = ""
-        try:
-            vl_recognizer = self._ensure_vl_recognizer()
-            if vl_recognizer is None:
-                logger.error("❌ VL recognizer not available for table recognition")
-                # return self._create_empty_table_result(layout_item, bbox, table_angle, ocr_source)
-                raise RuntimeError("VL recognizer not available")
+        vl_recognizer = self._ensure_vl_recognizer()
+        if vl_recognizer is None:
+            logger.error("❌ VL recognizer not available for table recognition")
+            # return self._create_empty_table_result(layout_item, bbox, table_angle, ocr_source)
+            raise RuntimeError("VL recognizer not available")
             
+        try:
             vl_result = vl_recognizer.recognize_table(
                 cropped_table,
                 return_cells_coordinate=True

+ 40 - 1
ocr_tools/universal_doc_parser/core/layout_model_router.py

@@ -30,12 +30,15 @@ class SmartLayoutRouter(BaseLayoutDetector):
     
     def __init__(self, config: Dict[str, Any]):
         super().__init__(config)
-        self.strategy = config.get('strategy', 'ocr_eval')  # ocr_eval, auto
+        self.strategy = config.get('strategy', 'ocr_eval')  # ocr_eval, auto, scene
         self.models = {}
         self.model_configs = config.get('models', {})
         self.fallback_config = config.get('fallback_model', None)
         self.evaluator = OCRBasedLayoutEvaluator()
         self.ocr_recognizer = None  # 用于在ocr_eval策略中获取OCR结果
+        self.scene_name = config.get('scene_name', None)
+        self.scene_strategy = config.get('scene_strategy', {})
+        self.default_model = config.get('default_model', None)
         # 调试模式支持
         self.debug_mode = config.get('debug_mode', False)
         self.output_dir = config.get('output_dir', None)
@@ -90,6 +93,10 @@ class SmartLayoutRouter(BaseLayoutDetector):
     def set_ocr_recognizer(self, ocr_recognizer):
         """设置OCR识别器(用于ocr_eval策略)"""
         self.ocr_recognizer = ocr_recognizer
+
+    def set_scene_name(self, scene_name: Optional[str]):
+        """设置场景名称(用于scene策略)"""
+        self.scene_name = scene_name
     
     def _detect_raw(
         self, 
@@ -137,8 +144,40 @@ class SmartLayoutRouter(BaseLayoutDetector):
             return self._ocr_eval_detect(image, ocr_spans)
         elif self.strategy == 'auto':
             return self._auto_select_detect(image)
+        elif self.strategy == 'scene':
+            return self._scene_select_detect(image)
         else:
             raise ValueError(f"Unknown strategy: {self.strategy}")
+
+    def _scene_select_detect(
+        self,
+        image: Union[np.ndarray, Image.Image]
+    ) -> List[Dict[str, Any]]:
+        """
+        场景策略:根据scene_strategy直接选择模型
+
+        注意:不执行ocr_eval,直接使用选定模型
+        """
+        selected_model = None
+        if self.scene_name:
+            scene_rule = self.scene_strategy.get(self.scene_name)
+            if isinstance(scene_rule, str):
+                selected_model = scene_rule
+            elif isinstance(scene_rule, dict):
+                selected_model = scene_rule.get('model')
+
+        if not selected_model:
+            selected_model = self.default_model
+
+        if not selected_model and self.models:
+            selected_model = next(iter(self.models.keys()))
+
+        if selected_model not in self.models:
+            logger.warning(f"⚠️ Scene strategy model not available: {selected_model}, using first model")
+            selected_model = next(iter(self.models.keys()))
+
+        logger.info(f"🎯 Scene strategy selected model: {selected_model} (scene: {self.scene_name})")
+        return self.models[selected_model].detect(image)
     
     def _ocr_eval_detect(
         self, 

+ 12 - 0
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -117,6 +117,15 @@ class EnhancedDocPipeline:
         self._init_element_processors()
         
         logger.info(f"✅ Pipeline initialized for scene: {self.scene_name}")
+
+    def set_scene_name(self, scene_name: Optional[str]):
+        """设置场景名称,并同步到布局路由器"""
+        if not scene_name:
+            return
+        self.scene_name = scene_name
+        if hasattr(self.layout_detector, 'set_scene_name'):
+            self.layout_detector.set_scene_name(scene_name)
+        logger.info(f"🔄 Scene updated in pipeline: {scene_name}")
     
     def _ensure_vl_recognizer(self):
         """懒加载 VL 识别器(仅在需要时初始化,且只初始化一次)"""
@@ -155,6 +164,9 @@ class EnhancedDocPipeline:
             self.layout_detector = ModelFactory.create_layout_detector(
                 self.config['layout_detection']
             )
+
+            if hasattr(self.layout_detector, 'set_scene_name'):
+                self.layout_detector.set_scene_name(self.scene_name)
             
             # 如果是智能路由器且使用ocr_eval策略,需要设置OCR识别器
             if hasattr(self.layout_detector, 'set_ocr_recognizer'):

+ 13 - 8
ocr_tools/universal_doc_parser/main_v2.py

@@ -179,6 +179,8 @@ def process_single_input(
         try:
             if scene:
                 pipeline.scene_name = scene
+                if hasattr(pipeline, 'set_scene_name'):
+                    pipeline.set_scene_name(scene)
                 logger.info(f"🔄 Scene overridden to: {scene}")
             
             logger.info(f"🚀 开始处理: {input_path}")
@@ -349,6 +351,7 @@ def main():
     )
     parser.add_argument(
         "--scene", "-s",
+        required=True,
         choices=["bank_statement", "financial_report"],
         help="场景类型(覆盖配置文件设置)"
     )
@@ -436,10 +439,10 @@ if __name__ == "__main__":
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003_270_skew(-0.4).png",
-            "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
+            # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
             # "output_dir": "./output/2023年度报告母公司/bank_statement_yusys_v3",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v3",
-            "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_glm_vl",
+            # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_glm_vl",
 
             # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v2",
@@ -458,8 +461,9 @@ if __name__ == "__main__":
 
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/提取自赤峰黄金2023年报.pdf",
             # "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
-            # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
-            # "output_dir": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
+            "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
+            "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v4",
+            # "output_dir": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报/bank_statement_yusys_v4",
 
             # "input": "/Users/zhch158/workspace/data/流水分析/施博深.pdf",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/施博深/bank_statement_yusys_v3",
@@ -475,7 +479,7 @@ if __name__ == "__main__":
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司/bank_statement_yusys_v3",
 
             # 配置文件
-            "config": "./config/bank_statement_glm_vl.yaml",
+            "config": "./config/bank_statement_yusys_v4.yaml",
             # "config": "./config/bank_statement_yusys_v3.yaml",
             # "config": "./config/bank_statement_smart_router.yaml",
             # "config": "./config/bank_statement_mineru_vl.yaml",
@@ -483,10 +487,11 @@ if __name__ == "__main__":
             # "config": "./config/bank_statement_paddle_vl.yaml",
             
             # 场景
-            "scene": "bank_statement",
+            # "scene": "bank_statement",
+            "scene": "financial_report",
             
             # 页面范围(可选)
-            "pages": "3-7",  # 只处理前1页
+            "pages": "11",  # 只处理前1页
             # "pages": "1-3,5,7-10",  # 处理指定页面
             # "pages": "83-109",  # 处理指定页面
 
@@ -499,7 +504,7 @@ if __name__ == "__main__":
             "log_level": "DEBUG",
 
             # 日志文件
-            "log_file": "./output/logs/bank_statement_glm_vl/process.log",
+            "log_file": "./output/logs/bank_statement_yusys_v4/process.log",
         }
         
         # 构造参数

+ 10 - 0
ocr_tools/universal_doc_parser/models/adapters/glmocr_vl_adapter.py

@@ -121,6 +121,16 @@ class GLMOCRVLRecognizer(BaseVLRecognizer):
                     'Authorization': f'Bearer {self.api_key}'
                 })
             
+            # 初始化时对 GLM-OCR API 做连通性测试,尽早暴露网络/鉴权问题
+            try:
+                test_response = self.session.get(self.api_url, timeout=(self.connect_timeout, self.http_timeout), verify=self.verify_ssl)
+                if test_response.status_code == 200:
+                    logger.debug(f"Successfully connected to GLM-OCR API at {self.api_url}")
+                else:
+                    logger.warning(f"Received unexpected status code {test_response.status_code} from GLM-OCR API: {test_response.text}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Failed to connect to GLM-OCR API at {self.api_url}: {e}")
+                raise
             logger.success(f"✅ GLM-OCR VL recognizer initialized: {self.api_url}")
             
         except Exception as e:

+ 18 - 1
ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py

@@ -378,6 +378,19 @@ class MinerUWiredTableRecognizer:
 
             # Step 2: 使用连通域法提取单元格 (替换了原来的投影法)
             debug_prefix = f"{dbg.prefix}_grid" if dbg.prefix else "grid"
+
+            # 计算 OCR 文本容差:取最小行高的 50%,无有效 OCR 时回退为 10.0
+            ocr_heights = []
+            for ocr in ocr_boxes or []:
+                bbox = ocr.get("bbox", [])
+                if len(bbox) >= 4:
+                    height = bbox[3] - bbox[1]
+                    if height > 0:
+                        ocr_heights.append(height)
+            if ocr_heights:
+                ocr_text_pixel_tolerance = min(ocr_heights) * 0.5
+            else:
+                ocr_text_pixel_tolerance = 10.0
             
             # 传入原图的实际尺寸和裁剪padding
             bboxes = self.grid_recovery.compute_cells_from_lines(
@@ -402,6 +415,7 @@ class MinerUWiredTableRecognizer:
                         table_image=table_image,
                         unet_cells=bboxes,
                         ocr_boxes=ocr_boxes or [],
+                        ocr_text_pixel_tolerance=ocr_text_pixel_tolerance,
                         pdf_type=pdf_type,
                         debug_dir=debug_dir,
                         debug_prefix=debug_prefix
@@ -425,7 +439,10 @@ class MinerUWiredTableRecognizer:
 
             # Step 3: 重建网格结构 (计算 row, col, rowspan, colspan)
             # OCR补偿已在Step 2中完成,这里仅做网格重建
-            merged_cells = self.grid_recovery.recover_grid_structure(bboxes)
+            merged_cells = self.grid_recovery.recover_grid_structure(
+                bboxes,
+                ocr_text_pixel_tolerance=ocr_text_pixel_tolerance
+            )
             
             # Step 3.5: 可视化逻辑结构 (新增)
             if self.debug_utils.debug_is_on("save_grid_structure", dbg):

+ 36 - 6
ocr_tools/universal_doc_parser/models/adapters/paddle_table_classifier.py

@@ -198,20 +198,50 @@ class PaddleTableClassifier(BaseAdapter):
         else:
             gray = img_array
         
-        # 二值化
-        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        # 二值化:自适应阈值更适合浅色表格线
+        binary = cv2.adaptiveThreshold(
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY_INV,
+            25,
+            10
+        )
         
         h, w = binary.shape
         
         # 检测横线
-        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(20, w//30), 1))
+        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(20, w // 30), 1))
         horizontal_mask = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)
-        horizontal_lines = cv2.findContours(horizontal_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
+        horizontal_contours = cv2.findContours(horizontal_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
         
         # 检测竖线
-        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(20, h//30)))
+        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(20, h // 30)))
         vertical_mask = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)
-        vertical_lines = cv2.findContours(vertical_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
+        vertical_contours = cv2.findContours(vertical_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
+
+        # 线段长度/长宽比过滤,降低文字竖画误检
+        def filter_lines(contours, orientation):
+            filtered = []
+            for cnt in contours:
+                x, y, cw, ch = cv2.boundingRect(cnt)
+                if cw <= 0 or ch <= 0:
+                    continue
+                if orientation == "h":
+                    if cw < w * 0.15:
+                        continue
+                    if cw / max(ch, 1) < 5.0:
+                        continue
+                else:
+                    if ch < h * 0.15:
+                        continue
+                    if ch / max(cw, 1) < 5.0:
+                        continue
+                filtered.append(cnt)
+            return filtered
+
+        horizontal_lines = filter_lines(horizontal_contours, "h")
+        vertical_lines = filter_lines(vertical_contours, "v")
         
         # 调试可视画
         # 使用传入的 debug_options (包含了可能的 override)

+ 141 - 36
ocr_tools/universal_doc_parser/models/adapters/wired_table/cell_fusion.py

@@ -57,11 +57,22 @@ class CellFusionEngine:
         self.rtdetr_conf_threshold = self.config.get('rtdetr_conf_threshold', 0.5)
         self.enable_ocr_compensation = self.config.get('enable_ocr_compensation', True)
         self.enable_boundary_noise_filter = self.config.get('enable_boundary_noise_filter', True)
+        self.unet_split_min_count = self.config.get('unet_split_min_count', 2)
+        self.rtdetr_split_cover_threshold = self.config.get('rtdetr_split_cover_threshold', 0.5)
+        self.unet_split_cover_threshold = self.config.get('unet_split_cover_threshold', 0.5)
+        self.unet_split_rtdetr_score_threshold = self.config.get(
+            'unet_split_rtdetr_score_threshold',
+            self.rtdetr_conf_threshold
+        )
         
         logger.info(f"🔧 CellFusionEngine initialized: "
-                   f"unet_w={self.unet_weight}, rtdetr_w={self.rtdetr_weight}, "
-                   f"iou_merge={self.iou_merge_threshold}, ocr_comp={self.enable_ocr_compensation}, "
-                   f"boundary_filter={self.enable_boundary_noise_filter}")
+               f"unet_w={self.unet_weight}, rtdetr_w={self.rtdetr_weight}, "
+               f"iou_merge={self.iou_merge_threshold}, ocr_comp={self.enable_ocr_compensation}, "
+               f"boundary_filter={self.enable_boundary_noise_filter}, "
+               f"unet_split_min={self.unet_split_min_count}, "
+               f"unet_split_cover={self.unet_split_cover_threshold}, "
+               f"unet_split_score={self.unet_split_rtdetr_score_threshold}, "
+               f"rtdetr_split_cover={self.rtdetr_split_cover_threshold}")
     
     def should_use_rtdetr(
         self,
@@ -99,6 +110,7 @@ class CellFusionEngine:
         table_image: np.ndarray,
         unet_cells: List[List[float]],
         ocr_boxes: List[Dict[str, Any]],
+        ocr_text_pixel_tolerance: float = 10.0,
         pdf_type: str = 'ocr',
         debug_dir: Optional[str] = None,
         debug_prefix: str = "fusion"
@@ -110,6 +122,7 @@ class CellFusionEngine:
             table_image: 表格图像(原图坐标系)
             unet_cells: UNet检测的单元格列表 [[x1,y1,x2,y2], ...](原图坐标系)
             ocr_boxes: OCR结果列表
+            ocr_text_pixel_tolerance: OCR文本容差(原图坐标系,默认10.0)
             pdf_type: PDF类型 ('txt' 或 'ocr')
             debug_dir: 调试输出目录(可选)
             debug_prefix: 调试文件前缀
@@ -126,7 +139,7 @@ class CellFusionEngine:
             max(unet_cells, key=lambda box: box[2])[2], \
             max(unet_cells, key=lambda box: box[3])[3]
         ] if unet_cells else [0,0,0,0]
-        
+
         # 决策:是否使用 RT-DETR
         use_rtdetr = self.should_use_rtdetr(pdf_type, len(unet_cells), (w, h))
         
@@ -165,8 +178,8 @@ class CellFusionEngine:
                 table_image,
                 conf_threshold=self.rtdetr_conf_threshold
             )
-            # rtdetr_result从上到下,从左到右排序
-            rtdetr_results.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
+            # rtdetr_result从上到下、从左到右排序:y坐标按10像素容差取整分行,行内按x坐标取整排序
+            rtdetr_results = sorted(rtdetr_results, key=lambda x: (round(x['bbox'][1] / 10), round(x['bbox'][0])))
             rtdetr_cells = [res['bbox'] for res in rtdetr_results]
             rtdetr_scores = [res['score'] for res in rtdetr_results]
             fusion_stats['rtdetr_count'] = len(rtdetr_cells)
@@ -179,7 +192,7 @@ class CellFusionEngine:
         
         # Phase 2: 智能融合
         # 使用稳健边界估计(避免单个超大单元格撑开边界)
-        table_bbox = self._estimate_robust_table_bbox(rtdetr_cells)
+        table_bbox = self._estimate_robust_table_bbox(rtdetr_cells, ocr_text_pixel_tolerance)
         
         # 将所有单元格的边界限制在表格边界内
         # rtdetr_cells = self._clip_cells_to_bbox(rtdetr_cells, table_bbox)
@@ -190,6 +203,7 @@ class CellFusionEngine:
         fusion_stats['merged_count'] = merge_stats['merged']
         fusion_stats['merged_cells_count'] = merge_stats['merged_cells']
         fusion_stats['added_count'] = merge_stats['added']
+        fusion_stats['split_count'] = merge_stats.get('split', 0)
         
         # Phase 3: NMS 去重
         fused_cells, suppressed = self._nms_filter(fused_cells, self.iou_nms_threshold)
@@ -199,7 +213,8 @@ class CellFusionEngine:
         # Phase 4: 边界噪声过滤(过滤掉边界的 unet_only 噪声单元格)
         if self.enable_boundary_noise_filter:
             fused_cells, cell_labels, noise_filtered = self._filter_boundary_noise(
-                fused_cells, cell_labels, ocr_boxes, table_bbox
+                fused_cells, cell_labels, ocr_boxes, table_bbox,
+                boundary_tolerance=ocr_text_pixel_tolerance
             )
             fusion_stats['noise_filtered_count'] = noise_filtered
         else:
@@ -220,7 +235,7 @@ class CellFusionEngine:
         logger.info(
             f"📊 Fusion (UNet+RT-DETR): UNet={len(unet_cells)}, RT-DETR={len(rtdetr_cells)}, "
             f"1:1Merged={merge_stats['merged']}, MergedCells={merge_stats['merged_cells']}, "
-            f"Added={merge_stats['added']}, NoiseFiltered={noise_filtered}, "
+            f"Split={merge_stats.get('split', 0)}, Added={merge_stats['added']}, NoiseFiltered={noise_filtered}, "
             f"OCRCompensated={fusion_stats.get('ocr_compensated_count', 0)}, Final={len(fused_cells)}"
         )
         
@@ -243,13 +258,14 @@ class CellFusionEngine:
         """
         融合 UNet 和 RT-DETR 检测结果(增强版:支持合并单元格检测)
         
-        融合规则:
-        1. 检测RT-DETR的合并单元格(一对多匹配,基于包含关系)
-           - 判断RT-DETR单元格包含多少个UNet单元格
-           - 使用中心点+包含率判断(而非IoU)
-        2. UNet + RT-DETR 高IoU (>threshold) → 加权平均合并(一对一)
-        3. RT-DETR 独有 + 高置信度 (>0.7) → 补充
-        4. UNet 独有 → 保留
+        融合规则:
+        1. 检测RT-DETR的合并单元格(一对多匹配,基于包含关系)
+            - 判断RT-DETR单元格包含多少个UNet单元格
+            - 使用中心点+包含率判断(而非IoU)
+        2. 检测UNet过度合并(一个UNet包含多个RT-DETR)并拆分
+        3. UNet + RT-DETR 高IoU (>threshold) → 加权平均合并(一对一)
+        4. RT-DETR 独有 + 高置信度 (>0.7) → 补充
+        5. UNet 独有 → 保留
         
         包含关系判断逻辑:
         - UNet单元格的中心点在RT-DETR内
@@ -267,14 +283,14 @@ class CellFusionEngine:
             (fused_cells, stats, cell_labels)
             - fused_cells: 融合后的单元格
             - stats: {'merged': int, 'added': int, 'merged_cells': int}
-            - cell_labels: 每个单元格的来源标签列表 ['merged_span', 'merged_1to1', 'unet_only', 'rtdetr_only', 'new']
+            - cell_labels: 每个单元格的来源标签列表 ['merged_span', 'merged_1to1', 'unet_only', 'rtdetr_only', 'split_rtdetr', 'new']
         """
         
         fused_cells = []
         cell_labels = []  # 记录每个单元格的来源标签
         unet_matched = [False] * len(unet_cells)
         rtdetr_matched = [False] * len(rtdetr_cells)
-        stats = {'merged': 0, 'added': 0, 'merged_cells': 0}
+        stats = {'merged': 0, 'added': 0, 'merged_cells': 0, 'split': 0}
         
         # Step 1: 检测RT-DETR的合并单元格(一对多匹配)
         # 遍历RT-DETR单元格,查找被包含的多个UNet单元格
@@ -332,7 +348,7 @@ class CellFusionEngine:
                     coverage = min(total_unet_area / rtdetr_area, 1.0) if rtdetr_area > 0 else 0
                     
                     # 如果覆盖率>50%,说明这是一个真实的合并单元格
-                    if coverage > 0.5:
+                    if coverage > self.rtdetr_split_cover_threshold:
                         # 认定为合并单元格,取bounding与RT-DETR的最大范围, 且不能超过table_bbox范围
                         fused_cell = [
                             min(bounding_x1, rtdetr_cell[0]),
@@ -342,9 +358,9 @@ class CellFusionEngine:
                         ]
                         # x限制在table_bbox范围内
                         fused_cell[0] = max(fused_cell[0], table_bbox[0])
-                        # fused_cell[1] = max(fused_cell[1], table_bbox[1])
+                        fused_cell[1] = max(fused_cell[1], table_bbox[1])
                         fused_cell[2] = min(fused_cell[2], table_bbox[2])
-                        # fused_cell[3] = min(fused_cell[3], table_bbox[3])
+                        fused_cell[3] = min(fused_cell[3], table_bbox[3])
                         fused_cells.append(fused_cell)
                         cell_labels.append('merged_span')  # 标记为合并单元格
                         rtdetr_matched[rt_idx] = True
@@ -357,6 +373,80 @@ class CellFusionEngine:
                             f"(coverage={coverage:.2f}, score={rtdetr_scores[rt_idx]:.2f})"
                         )
         
+        # Step 1.5: 检测UNet过度合并(一个UNet包含多个RT-DETR)并拆分
+        for u_idx, unet_cell in enumerate(unet_cells):
+            if unet_matched[u_idx]:
+                continue
+
+            unet_area = self._calc_bbox_area(unet_cell)
+            if unet_area <= 0:
+                continue
+
+            contained_rtdetr = []
+            contained_intersects = []
+
+            for rt_idx, rtdetr_cell in enumerate(rtdetr_cells):
+                if rtdetr_matched[rt_idx]:
+                    continue
+                if rtdetr_scores[rt_idx] < self.unet_split_rtdetr_score_threshold:
+                    continue
+
+                rt_cx = (rtdetr_cell[0] + rtdetr_cell[2]) / 2
+                rt_cy = (rtdetr_cell[1] + rtdetr_cell[3]) / 2
+                if not (unet_cell[0] <= rt_cx <= unet_cell[2] and
+                        unet_cell[1] <= rt_cy <= unet_cell[3]):
+                    continue
+
+                intersect_x1 = max(unet_cell[0], rtdetr_cell[0])
+                intersect_y1 = max(unet_cell[1], rtdetr_cell[1])
+                intersect_x2 = min(unet_cell[2], rtdetr_cell[2])
+                intersect_y2 = min(unet_cell[3], rtdetr_cell[3])
+                if intersect_x2 <= intersect_x1 or intersect_y2 <= intersect_y1:
+                    continue
+
+                intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
+                rtdetr_area = self._calc_bbox_area(rtdetr_cell)
+                contain_ratio = intersect_area / rtdetr_area if rtdetr_area > 0 else 0
+                if contain_ratio > 0.5:
+                    contained_rtdetr.append(rt_idx)
+                    contained_intersects.append(intersect_area)
+
+            if len(contained_rtdetr) >= self.unet_split_min_count:
+                # 计算总包含率:使用所有被包含RT-DETR单元格的外接矩形面积 vs UNet面积
+                # 与RT-DETR合并逻辑保持一致,避免相邻框重复/间隙导致覆盖率失真
+                rt_indices = contained_rtdetr
+                bounding_x1 = min(rtdetr_cells[i][0] for i in rt_indices)
+                bounding_y1 = min(rtdetr_cells[i][1] for i in rt_indices)
+                bounding_x2 = max(rtdetr_cells[i][2] for i in rt_indices)
+                bounding_y2 = max(rtdetr_cells[i][3] for i in rt_indices)
+                total_rtdetr_area = (bounding_x2 - bounding_x1) * (bounding_y2 - bounding_y1)
+                coverage = min(total_rtdetr_area / unet_area, 1.0)
+                if coverage >= self.unet_split_cover_threshold:
+                    # 认定为合并单元格,取bounding与RT-DETR的最大范围, 且不能超过table_bbox范围
+                    split_cell = [
+                        min(bounding_x1, unet_cell[0]),
+                        min(bounding_y1, unet_cell[1]),
+                        max(bounding_x2, unet_cell[2]),
+                        max(bounding_y2, unet_cell[3])
+                    ]
+                    split_cell = [
+                        max(split_cell[0], table_bbox[0]),
+                        max(split_cell[1], table_bbox[1]),
+                        min(split_cell[2], table_bbox[2]),
+                        min(split_cell[3], table_bbox[3])
+                    ]
+                    fused_cells.append(split_cell)
+                    cell_labels.append('split_rtdetr')
+                    for rt_idx in contained_rtdetr:
+                        rtdetr_matched[rt_idx] = True
+
+                    unet_matched[u_idx] = True
+                    stats['split'] += len(contained_rtdetr)
+                    logger.debug(
+                        f"🧩 UNet过度合并拆分: UNet[{u_idx}] -> {len(contained_rtdetr)} RT-DETR "
+                        f"(coverage={coverage:.2f})"
+                    )
+
         # Step 2: 一对一匹配(处理剩余的单元格)
         for u_idx, unet_cell in enumerate(unet_cells):
             if unet_matched[u_idx]:
@@ -401,9 +491,9 @@ class CellFusionEngine:
             if not rtdetr_matched[idx] and score > 0.7:
                 # rtdetr_cell不能超出table_bbox范围, x方向分别限制
                 rtdetr_cell[0] = max(rtdetr_cell[0], table_bbox[0])
-                # rtdetr_cell[1] = max(rtdetr_cell[1], table_bbox[1])
+                rtdetr_cell[1] = max(rtdetr_cell[1], table_bbox[1])
                 rtdetr_cell[2] = min(rtdetr_cell[2], table_bbox[2])
-                # rtdetr_cell[3] = min(rtdetr_cell[3], table_bbox[3])
+                rtdetr_cell[3] = min(rtdetr_cell[3], table_bbox[3])
                 fused_cells.append(rtdetr_cell)
                 cell_labels.append('rtdetr_only')  # 标记为RT-DETR独有
                 stats['added'] += 1
@@ -418,18 +508,16 @@ class CellFusionEngine:
         """
         稳健的表格边界估计
         
-        使用聚类方法找到"主流"的左右边界,避免单个超大单元格撑开边界。
+        使用聚类方法找到"主流"的边界,避免单个超大单元格撑开边界。
         
         算法:
-        1. 收集所有单元格的左边界x1和右边界x2
-        2. 对x1聚类,选择支持度最高的聚类中心作为表格左边界
-        3. 对x2聚类,选择支持度最高的聚类中心作为表格右边界
-        4. y方向使用简单的min/max(行高变化大,不适合聚类)
+        1. 收集所有单元格的边界
+        2. 聚类,选择支持度最高的聚类中心作为表格边界
+        3. 通过容差向内调整边界,过滤掉过于宽松的边界(可能包含噪声单元格)
         
         Args:
             rtdetr_cells: RT-DETR单元格列表
             cluster_tolerance: 聚类容差(像素)
-            
         Returns:
             table_bbox: [x1, y1, x2, y2]
         """
@@ -448,11 +536,13 @@ class CellFusionEngine:
         # 对x2聚类,找主流右边界
         robust_x2 = self._find_dominant_boundary(x2_coords, cluster_tolerance, mode='max')
         # y方向直接取极值
-        robust_y1 = min(y1_coords)
-        robust_y2 = max(y2_coords)
+        robust_y1 = self._find_dominant_boundary(y1_coords, cluster_tolerance, mode='min')
+        robust_y2 = self._find_dominant_boundary(y2_coords, cluster_tolerance, mode='max')
         
         logger.debug(f"📐 稳健边界估计: x=[{robust_x1:.1f}, {robust_x2:.1f}], "
-                    f"原始x范围=[{min(x1_coords):.1f}, {max(x2_coords):.1f}]")
+                    f"原始x范围=[{min(x1_coords):.1f}, {max(x2_coords):.1f}]"
+                    f" | y=[{robust_y1:.1f}, {robust_y2:.1f}], "
+                    f"原始y范围=[{min(y1_coords):.1f}, {max(y2_coords):.1f}]")
         
         return [robust_x1, robust_y1, robust_x2, robust_y2]
     
@@ -624,7 +714,8 @@ class CellFusionEngine:
         cells: List[List[float]],
         cell_labels: List[str],
         ocr_boxes: List[Dict[str, Any]],
-        rtdetr_bbox: List[float]
+        rtdetr_bbox: List[float],
+        boundary_tolerance: float = 0.0
     ) -> Tuple[List[List[float]], List[str], int]:
         """
         过滤边界噪声单元格
@@ -639,6 +730,7 @@ class CellFusionEngine:
             cell_labels: 单元格标签列表
             ocr_boxes: OCR结果列表
             rtdetr_bbox: RT-DETR单元格的边界框 [x1, y1, x2, y2]
+            boundary_tolerance: 边界判定容忍范围(像素,原图坐标系)
         Returns:
             (filtered_cells, filtered_labels, filtered_count)
         """
@@ -646,6 +738,8 @@ class CellFusionEngine:
         filtered_labels = []
         filtered_count = 0
         
+        tol = max(0.0, boundary_tolerance)
+
         for cell, label in zip(cells, cell_labels):
             # # 只过滤 unet_only 标记的单元格
             # if label != 'unet_only':
@@ -655,9 +749,9 @@ class CellFusionEngine:
             
             x1, y1, x2, y2 = cell
             
-            # 检查是否在边界
-            is_left_boundary = x1 <= rtdetr_bbox[0]
-            is_right_boundary = x2 >= rtdetr_bbox[2]
+            # 检查是否在边界(加入容忍范围,避免贴边被误判)
+            is_left_boundary = x1 <= (rtdetr_bbox[0] - tol)
+            is_right_boundary = x2 >= (rtdetr_bbox[2] + tol)
             is_on_boundary = is_left_boundary or is_right_boundary
             
             if not is_on_boundary:
@@ -906,6 +1000,7 @@ class CellFusionEngine:
             merged_cells_1to1 = []  # 1:1融合单元格(黄色)
             merged_cells_span = []  # 合并单元格(品红色,RT-DETR检测的跨格单元格)
             new_cells = []  # 新增单元格(紫色)
+            split_cells = []  # UNet拆分得到的RT-DETR单元格(青色)
             ocr_compensated = []  # OCR补偿单元格(橙色)
             
             for fused_cell, label in zip(fused_cells, cell_labels):
@@ -919,6 +1014,8 @@ class CellFusionEngine:
                     merged_cells_span.append(fused_cell)
                 elif label == 'new':
                     new_cells.append(fused_cell)
+                elif label == 'split_rtdetr':
+                    split_cells.append(fused_cell)
                 elif label == 'ocr_compensated':
                     ocr_compensated.append(fused_cell)
             
@@ -942,6 +1039,10 @@ class CellFusionEngine:
             for cell in new_cells:
                 x1, y1, x2, y2 = [int(v) for v in cell]
                 cv2.rectangle(img3, (x1, y1), (x2, y2), (128, 0, 128), 2)  # 紫色 - 新增
+
+            for cell in split_cells:
+                x1, y1, x2, y2 = [int(v) for v in cell]
+                cv2.rectangle(img3, (x1, y1), (x2, y2), (255, 255, 0), 3)  # 青色 - UNet拆分
             
             for cell in ocr_compensated:
                 x1, y1, x2, y2 = [int(v) for v in cell]
@@ -967,6 +1068,10 @@ class CellFusionEngine:
                 legend_y += 30
                 cv2.putText(img3, f"Purple: New ({len(new_cells)})", (10, legend_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (128, 0, 128), 2)
+            if split_cells:
+                legend_y += 30
+                cv2.putText(img3, f"Cyan: Split ({len(split_cells)})", (10, legend_y),
+                           cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)
             if ocr_compensated:
                 legend_y += 30
                 cv2.putText(img3, f"Orange: OCR Compensated ({len(ocr_compensated)})", (10, legend_y),

+ 8 - 3
ocr_tools/universal_doc_parser/models/adapters/wired_table/grid_recovery.py

@@ -498,7 +498,10 @@ class GridRecovery:
     
     
     @staticmethod
-    def recover_grid_structure(bboxes: List[List[float]]) -> List[Dict]:
+    def recover_grid_structure(
+        bboxes: List[List[float]],
+        ocr_text_pixel_tolerance: float = 0.0
+    ) -> List[Dict]:
         """
         从散乱的单元格 bbox 恢复表格的行列结构 (row, col, rowspan, colspan)
         重构版:基于投影网格线 (Projected Grid Lines) 的算法
@@ -506,6 +509,7 @@ class GridRecovery:
         
         Args:
             bboxes: 单元格bbox列表
+            ocr_text_pixel_tolerance: OCR文本容差(原图坐标系)
             
         Returns:
             结构化单元格列表,包含 row, col, rowspan, colspan
@@ -519,14 +523,15 @@ class GridRecovery:
             y_coords.append(b[1])
             y_coords.append(b[3])
         
-        row_dividers= GridRecovery.find_grid_lines(y_coords, tolerance=5, min_support=1)
+        tolerance = max(5.0, min(float(ocr_text_pixel_tolerance), 20.0))
+        row_dividers = GridRecovery.find_grid_lines(y_coords, tolerance=tolerance, min_support=1)
         
         # 2. 识别列分割线 (X轴)
         x_coords = []
         for b in bboxes:
             x_coords.append(b[0])
             x_coords.append(b[2])
-        col_dividers= GridRecovery.find_grid_lines(x_coords, tolerance=5, min_support=1)
+        col_dividers = GridRecovery.find_grid_lines(x_coords, tolerance=tolerance, min_support=1)
         
         # 3. 构建网格结构
         structured_cells = []

+ 2 - 1
ocr_utils/markdown_generator.py

@@ -381,7 +381,8 @@ pages: {len(results.get('pages', []))}
                 text = content.get('text', '') if isinstance(content, dict) else str(content)
                 if text:
                     confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
-                    md_lines.append(f"🔖 **[印章]** {text} _(置信度: {confidence:.2f})_")
+                    # md_lines.append(f"🔖 **[印章]** {text} _(置信度: {confidence:.2f})_")
+                    md_lines.append(f"🔖 **[印章]** {text}")
                     md_lines.append("")
             
             elif elem_type == 'discarded':