|
@@ -90,14 +90,45 @@ def _handle_dry_run(args: argparse.Namespace) -> bool:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
def _create_pipeline(
    streaming: bool,
    config_path: str,
    output_dir: str,
    debug: bool = False,
    debug_layout: bool = False,
    debug_table: bool = False,
    debug_ocr: bool = False
):
    """
    Load the configuration, apply command-line debug overrides, and build the pipeline.

    Args:
        streaming: Build the streaming pipeline when true, the batch pipeline otherwise.
        config_path: Path of the configuration file to load.
        output_dir: Output directory; only handed to the streaming pipeline.
        debug: Global debug switch.
        debug_layout: Debug switch for layout detection.
        debug_table: Debug switch for table recognition.
        debug_ocr: Debug switch for OCR recognition.

    Returns:
        The constructed pipeline instance.
    """
    # Imported at call time rather than at module import.
    from core.config_manager import ConfigManager

    loaded_config = ConfigManager.load_config(config_path)

    # Overrides are written into the config *before* the pipeline is built, so that
    # whatever the pipeline constructs from the config sees the modified values.
    any_debug_requested = any((debug, debug_layout, debug_table, debug_ocr))
    if any_debug_requested:
        _apply_debug_overrides_to_config(
            loaded_config, debug, debug_layout, debug_table, debug_ocr
        )

    # The already-loaded dict is passed through; config_is_dict=True flags that
    # the first argument is not a path.
    if not streaming:
        logger.info("🔄 Using batch processing mode (all pages in memory)")
        return EnhancedDocPipeline(loaded_config, config_is_dict=True)

    logger.info("🔄 Using streaming processing mode (memory-efficient)")
    return StreamingDocPipeline(loaded_config, output_dir, config_is_dict=True)
|
|
|
|
|
|
|
@@ -122,6 +153,88 @@ def _get_default_output_config(debug: bool) -> dict:
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _apply_debug_overrides_to_config(
    config: dict,
    debug: bool,
    debug_layout: bool,
    debug_table: bool,
    debug_ocr: bool
) -> None:
    """
    Apply command-line debug flags on top of the loaded configuration, in place.

    Precedence:
        1. ``debug`` enables debug output for every module.
        2. ``debug_layout`` / ``debug_table`` / ``debug_ocr`` enable it per module.
        3. ``debug_options`` found in the config only act as defaults: this
           function switches ``enabled`` on, it never switches it off.

    Only sections that already exist in ``config`` are touched; a missing section
    is never created. A section that exists but is not a mapping (for example a
    key left empty in the config file, which loads as ``None``) is skipped with a
    warning instead of raising ``TypeError``. A ``debug_options`` value that is
    not a mapping is replaced by a fresh dict.

    Args:
        config: Configuration dict; modified directly.
        debug: Global debug switch.
        debug_layout: Debug switch for layout detection.
        debug_table: Debug switch for table recognition.
        debug_ocr: Debug switch for OCR recognition.
    """
    enable_layout_debug = bool(debug or debug_layout)
    enable_table_debug = bool(debug or debug_table)
    enable_ocr_debug = bool(debug or debug_ocr)

    # (requested?, config section, message logged once the section is switched on).
    # The table flag drives both table-related sections.
    section_overrides = (
        (enable_layout_debug, 'layout_detection', "✅ 启用布局检测 debug 输出"),
        (enable_table_debug, 'table_classification', "✅ 启用表格分类 debug 输出"),
        (enable_table_debug, 'table_recognition_wired', "✅ 启用有线表格识别 debug 输出"),
        (enable_ocr_debug, 'ocr_recognition', "✅ 启用 OCR 识别 debug 输出"),
    )

    for requested, section_name, enabled_message in section_overrides:
        if not requested or section_name not in config:
            continue

        section = config[section_name]
        if not isinstance(section, dict):
            # Declared but empty/invalid section: nothing sensible to switch on.
            logger.warning(
                f"Cannot enable debug for '{section_name}': section is not a mapping"
            )
            continue

        debug_options = section.get('debug_options')
        if not isinstance(debug_options, dict):
            # Missing or null debug_options: start from an empty mapping.
            debug_options = {}
            section['debug_options'] = debug_options

        debug_options['enabled'] = True
        logger.info(enabled_message)

    # Output settings are only updated when an 'output' mapping already exists.
    # NOTE(review): when 'output' is absent nothing is recorded here; creating the
    # section would change what callers see when they test the section for
    # emptiness. Confirm the per-module flags are honored on that path.
    if enable_layout_debug or enable_table_debug or enable_ocr_debug:
        output_config = config.get('output')
        if isinstance(output_config, dict):
            output_config['debug_mode'] = True
            if enable_layout_debug or enable_ocr_debug:
                # setdefault: an explicit value in the config file wins.
                output_config.setdefault('save_layout_image', True)
                output_config.setdefault('save_ocr_image', True)

    # Summary of what was requested on the command line.
    if debug:
        logger.info("🐛 全局 Debug 模式已启用(所有模块)")
        return

    requested_labels = (
        (debug_layout, "布局检测"),
        (debug_table, "表格识别"),
        (debug_ocr, "OCR识别"),
    )
    debug_modules = [label for flag, label in requested_labels if flag]
    if debug_modules:
        logger.info(f"🐛 Debug 模式已启用: {', '.join(debug_modules)}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None):
|
|
def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None):
|
|
|
"""设置日志"""
|
|
"""设置日志"""
|
|
|
logger.remove()
|
|
logger.remove()
|
|
@@ -148,6 +261,9 @@ def process_single_input(
|
|
|
config_path: Path,
|
|
config_path: Path,
|
|
|
output_dir: Path,
|
|
output_dir: Path,
|
|
|
debug: bool = False,
|
|
debug: bool = False,
|
|
|
|
|
+ debug_layout: bool = False,
|
|
|
|
|
+ debug_table: bool = False,
|
|
|
|
|
+ debug_ocr: bool = False,
|
|
|
scene: Optional[str] = None,
|
|
scene: Optional[str] = None,
|
|
|
page_range: Optional[str] = None,
|
|
page_range: Optional[str] = None,
|
|
|
streaming: bool = False
|
|
streaming: bool = False
|
|
@@ -159,7 +275,10 @@ def process_single_input(
|
|
|
input_path: 输入路径
|
|
input_path: 输入路径
|
|
|
config_path: 配置文件路径
|
|
config_path: 配置文件路径
|
|
|
output_dir: 输出目录
|
|
output_dir: 输出目录
|
|
|
- debug: 是否开启debug模式
|
|
|
|
|
|
|
+ debug: 全局debug开关(启用所有模块debug)
|
|
|
|
|
+ debug_layout: 仅启用布局检测debug
|
|
|
|
|
+ debug_table: 仅启用表格识别debug
|
|
|
|
|
+ debug_ocr: 仅启用OCR识别debug
|
|
|
scene: 场景类型覆盖
|
|
scene: 场景类型覆盖
|
|
|
page_range: 页面范围(如 "1-5,7,9-12")
|
|
page_range: 页面范围(如 "1-5,7,9-12")
|
|
|
streaming: 是否使用流式处理模式(按页处理,立即保存,节省内存)
|
|
streaming: 是否使用流式处理模式(按页处理,立即保存,节省内存)
|
|
@@ -168,16 +287,17 @@ def process_single_input(
|
|
|
处理结果和输出路径
|
|
处理结果和输出路径
|
|
|
"""
|
|
"""
|
|
|
try:
|
|
try:
|
|
|
- # 创建流水线
|
|
|
|
|
- pipeline = _create_pipeline(streaming, str(config_path), str(output_dir))
|
|
|
|
|
|
|
+ # 创建流水线(debug 覆盖已在 _create_pipeline 中应用)
|
|
|
|
|
+ pipeline = _create_pipeline(
|
|
|
|
|
+ streaming,
|
|
|
|
|
+ str(config_path),
|
|
|
|
|
+ str(output_dir),
|
|
|
|
|
+ debug=debug,
|
|
|
|
|
+ debug_layout=debug_layout,
|
|
|
|
|
+ debug_table=debug_table,
|
|
|
|
|
+ debug_ocr=debug_ocr
|
|
|
|
|
+ )
|
|
|
output_config = pipeline.config.get('output', {}) or _get_default_output_config(debug)
|
|
output_config = pipeline.config.get('output', {}) or _get_default_output_config(debug)
|
|
|
-
|
|
|
|
|
- # 命令行 --debug 优先级最高:覆盖 yaml 中的所有 debug 设置
|
|
|
|
|
- if debug:
|
|
|
|
|
- pipeline.debug_mode = True
|
|
|
|
|
- output_config['debug_mode'] = True
|
|
|
|
|
- output_config.setdefault('save_layout_image', True)
|
|
|
|
|
- output_config.setdefault('save_ocr_image', True)
|
|
|
|
|
|
|
|
|
|
use_context = not streaming and hasattr(pipeline, '__enter__')
|
|
use_context = not streaming and hasattr(pipeline, '__enter__')
|
|
|
if use_context:
|
|
if use_context:
|
|
@@ -324,9 +444,14 @@ def main():
|
|
|
# 处理图片目录
|
|
# 处理图片目录
|
|
|
python main_v2.py -i ./images/ -c config/bank_statement_paddle_vl.yaml
|
|
python main_v2.py -i ./images/ -c config/bank_statement_paddle_vl.yaml
|
|
|
|
|
|
|
|
- # 开启debug模式(输出可视化图片)
|
|
|
|
|
|
|
+ # 开启全局debug模式(所有模块输出可视化图片)
|
|
|
python main_v2.py -i doc.pdf -c config.yaml --debug
|
|
python main_v2.py -i doc.pdf -c config.yaml --debug
|
|
|
|
|
|
|
|
|
|
+ # 开启特定模块的debug(精细控制)
|
|
|
|
|
+ python main_v2.py -i doc.pdf -c config.yaml --debug-layout # 仅布局debug
|
|
|
|
|
+ python main_v2.py -i doc.pdf -c config.yaml --debug-table # 仅表格debug
|
|
|
|
|
+ python main_v2.py -i doc.pdf -c config.yaml --debug-layout --debug-table # 组合
|
|
|
|
|
+
|
|
|
# 指定输出目录
|
|
# 指定输出目录
|
|
|
python main_v2.py -i doc.pdf -c config.yaml -o ./my_output/
|
|
python main_v2.py -i doc.pdf -c config.yaml -o ./my_output/
|
|
|
|
|
|
|
@@ -365,7 +490,22 @@ def main():
|
|
|
parser.add_argument(
|
|
parser.add_argument(
|
|
|
"--debug",
|
|
"--debug",
|
|
|
action="store_true",
|
|
action="store_true",
|
|
|
- help="开启debug模式(输出layout和OCR可视化图片)"
|
|
|
|
|
|
|
+ help="开启全局debug模式(启用所有模块的调试输出)"
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--debug-layout",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="仅开启布局检测的debug输出"
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--debug-table",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="仅开启表格识别的debug输出"
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--debug-ocr",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="仅开启OCR识别的debug输出"
|
|
|
)
|
|
)
|
|
|
parser.add_argument(
|
|
parser.add_argument(
|
|
|
"--log_level",
|
|
"--log_level",
|
|
@@ -407,6 +547,9 @@ def main():
|
|
|
config_path=Path(args.config),
|
|
config_path=Path(args.config),
|
|
|
output_dir=Path(args.output_dir),
|
|
output_dir=Path(args.output_dir),
|
|
|
debug=args.debug,
|
|
debug=args.debug,
|
|
|
|
|
+ debug_layout=args.debug_layout,
|
|
|
|
|
+ debug_table=args.debug_table,
|
|
|
|
|
+ debug_ocr=args.debug_ocr,
|
|
|
scene=args.scene,
|
|
scene=args.scene,
|
|
|
page_range=args.pages,
|
|
page_range=args.pages,
|
|
|
streaming=args.streaming
|
|
streaming=args.streaming
|