SHA1
--- a/ocr_tools/daemons/glmocr_local_daemon.sh
+++ b/ocr_tools/daemons/glmocr_local_daemon.sh
@@ -5,7 +5,12 @@
 
				 # 模型下载地址: https://huggingface.co/ggml-org/GLM-OCR-GGUF
			
 
				 # 模型下载地址: https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5-GGUF
			
 
				 
			
 
				-# curl -X POST http://localhost:8080/v1/chat/completions -d @payload.json
			
 
				+# unset https_proxy http_proxy HF_ENDPOINT
			
 
				+# llama-server -hf ggml-org/GLM-OCR-GGUF:Q8_0
			
 
				+# mv ~/Library/Caches/llama.cpp/ggml-org_GLM-OCR-GGUF_GLM-OCR-Q8_0.gguf  ~/models/glmocr
			
 
				+# mv ~/Library/Caches/llama.cpp/ggml-org_GLM-OCR-GGUF_mmproj-GLM-OCR-Q8_0.gguf  ~/models/glmocr
			
 
				+
			
 
				+# curl -X POST http://localhost:8101/v1/chat/completions -d @payload.json
			
 
				 
			
 
				 LOGDIR="$HOME/workspace/logs"
			
 
				 mkdir -p $LOGDIR
			
@@ -14,12 +19,12 @@ LOGFILE="$LOGDIR/glmocr_llamaserver.log"
 
				 
			
 
				 # 配置参数
			
 
				 CONDA_ENV="mineru2"
			
 
				-PORT="8080"
			
 
				+PORT="8101"
			
 
				 HOST="0.0.0.0"
			
 
				 
			
 
				 # 本地 GGUF 模型路径
			
 
				-MODEL_PATH="$HOME/Library/Caches/llama.cpp/ggml-org_GLM-OCR-GGUF_GLM-OCR-Q8_0.gguf"
			
 
				-MMPROJ_PATH="$HOME/Library/Caches/llama.cpp/ggml-org_GLM-OCR-GGUF_mmproj-GLM-OCR-Q8_0.gguf"
			
 
				+MODEL_PATH="$HOME/models/glmocr/ggml-org_GLM-OCR-GGUF_GLM-OCR-Q8_0.gguf"
			
 
				+MMPROJ_PATH="$HOME/models/glmocr/ggml-org_GLM-OCR-GGUF_mmproj-GLM-OCR-Q8_0.gguf"
			
 
				 
			
 
				 # llama-server 参数
			
 
				 CONTEXT_SIZE="16384"         # 上下文长度（需 >= max_tokens，推荐 8192-16384）
			
@@ -92,7 +97,7 @@ start() {
 
				         --mmproj "$MMPROJ_PATH" \
			
 
				         --host $HOST \
			
 
				         --port $PORT \
			
 
				-        --media-path /Users/zhch158/workspace \
			
 
				+        --media-path $HOME/workspace \
			
 
				         -c $CONTEXT_SIZE \
			
 
				         -ngl $GPU_LAYERS \
			
 
				         -t $THREADS \
			
--- a/ocr_tools/daemons/paddle_local_daemon.sh
+++ b/ocr_tools/daemons/paddle_local_daemon.sh
@@ -3,7 +3,13 @@
 
				 # 对应: PaddleOCR-VL 本地 llama-server 服务（macOS），使用 GGUF 格式模型
			
 
				 # 适用于 Mac M4 Pro 48G，使用 Metal GPU 加速
			
 
				 # 模型下载地址: https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5-GGUF
			
 
				-# curl -X POST http://localhost:8081/v1/chat/completions -d @payload.json
			
 
				+
			
 
				+# unset https_proxy http_proxy HF_ENDPOINT
			
 
				+# llama-server -hf PaddlePaddle/PaddleOCR-VL-1.5-GGUF
			
 
				+# mv ~/Library/Caches/llama.cpp/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5.gguf  ~/models/paddleocr_vl
			
 
				+# mv ~/Library/Caches/llama.cpp/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5-mmproj.gguf  ~/models/paddleocr_vl
			
 
				+
			
 
				+# curl -X POST http://localhost:8102/v1/chat/completions -d @payload.json
			
 
				 
			
 
				 LOGDIR="$HOME/workspace/logs"
			
 
				 mkdir -p $LOGDIR
			
@@ -12,12 +18,12 @@ LOGFILE="$LOGDIR/paddleocr_llamaserver.log"
 
				 
			
 
				 # 配置参数
			
 
				 CONDA_ENV="mineru2"
			
 
				-PORT="8081"
			
 
				+PORT="8102"
			
 
				 HOST="0.0.0.0"
			
 
				 
			
 
				 # 本地 GGUF 模型路径
			
 
				-MODEL_PATH="$HOME/Library/Caches/llama.cpp/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5.gguf"
			
 
				-MMPROJ_PATH="$HOME/Library/Caches/llama.cpp/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5-mmproj.gguf"
			
 
				+MODEL_PATH="$HOME/models/paddleocr_vl/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5.gguf"
			
 
				+MMPROJ_PATH="$HOME/models/paddleocr_vl/PaddlePaddle_PaddleOCR-VL-1.5-GGUF_PaddleOCR-VL-1.5-mmproj.gguf"
			
 
				 
			
 
				 # llama-server 参数
			
 
				 CONTEXT_SIZE="16384"         # 上下文长度（需 >= max_tokens，推荐 8192-16384）
			
@@ -89,7 +95,7 @@ start() {
 
				         --mmproj "$MMPROJ_PATH" \
			
 
				         --host $HOST \
			
 
				         --port $PORT \
			
 
				-        --media-path /Users/zhch158/workspace \
			
 
				+        --media-path $HOME/workspace \
			
 
				         -c $CONTEXT_SIZE \
			
 
				         -ngl $GPU_LAYERS \
			
 
				         -t $THREADS \
			
--- a/ocr_tools/remove_watermark_tool/remove_watermark.py
+++ b/ocr_tools/remove_watermark_tool/remove_watermark.py
@@ -0,0 +1,406 @@
 
				+"""
			
 
				+银行流水水印去除工具
			
 
				+
			
 
				+支持 PDF 和常见图片格式（jpg/png/tif/bmp/webp）。
			
 
				+- 输入 PDF → 输出去水印 PDF（扫描件）或直接复制（文字型）
			
 
				+- 输入图片 → 输出去水印图片（保持原格式）
			
 
				+适用于福建农信、邮储银行等带有半透明文字水印的银行流水单。
			
 
				+
			
 
				+用法:
			
 
				+    # 处理单个 PDF 或图片
			
 
				+    python remove_watermark.py input.pdf
			
 
				+    python remove_watermark.py input.jpg
			
 
				+
			
 
				+    # 指定输出路径
			
 
				+    python remove_watermark.py input.pdf -o output.pdf
			
 
				+
			
 
				+    # 指定页面范围（支持 "1-5,7,9-12" 格式）
			
 
				+    python remove_watermark.py input.pdf --page-range 1-3
			
 
				+
			
 
				+    # 调整去除阈值（默认 160，范围建议 140-180）
			
 
				+    python remove_watermark.py input.pdf --threshold 170
			
 
				+
			
 
				+    # 批量处理目录下所有 PDF 和图片
			
 
				+    python remove_watermark.py /path/to/dir/ --batch
			
 
				+
			
 
				+    # 预览单页/图片效果（不保存，直接展示对比图）
			
 
				+    python remove_watermark.py input.pdf --preview --page 0
			
 
				+    python remove_watermark.py input.jpg --preview
			
 
				+"""
			
 
				+import argparse
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+from typing import Optional
			
 
				+
			
 
				+# 将 ocr_platform 根目录加入 sys.path，以便导入 ocr_utils
			
 
				+_repo_root = Path(__file__).parents[2]
			
 
				+if str(_repo_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(_repo_root))
			
 
				+
			
 
				+from loguru import logger
			
 
				+from ocr_utils.watermark_utils import (
			
 
				+    detect_watermark,
			
 
				+    remove_watermark_from_image,
			
 
				+    scan_pdf_watermark_xobjs,
			
 
				+    remove_txt_pdf_watermark,
			
 
				+)
			
 
				+
			
 
				+# 支持的图片后缀（小写）
			
 
				+IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
			
 
				+
			
 
				+
			
 
				+def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
			
 
				+    """
			
 
				+    对文字型 PDF 执行原生水印去除，保留文字可搜索性。
			
 
				+
			
 
				+    内部委托给 watermark_utils.remove_txt_pdf_watermark() 完成内存流处理，
			
 
				+    有水印时将结果写入 output_path。
			
 
				+
			
 
				+    流程：
			
 
				+    1. scan_pdf_watermark_xobjs() 快速扫描前 3 页，无水印直接返回 0
			
 
				+    2. remove_txt_pdf_watermark() 执行全量去除，返回 bytes 或 None
			
 
				+    3. 有水印时写 output_path
			
 
				+
			
 
				+    Returns:
			
 
				+        1 表示去除成功，0 表示未发现水印
			
 
				+    """
			
 
				+    pdf_bytes = input_path.read_bytes()
			
 
				+
			
 
				+    if not scan_pdf_watermark_xobjs(pdf_bytes, sample_pages=3):
			
 
				+        return 0
			
 
				+
			
 
				+    cleaned = remove_txt_pdf_watermark(pdf_bytes)
			
 
				+    if cleaned is None:
			
 
				+        return 0
			
 
				+
			
 
				+    output_path.write_bytes(cleaned)
			
 
				+    return 1
			
 
				+
			
 
				+
			
 
				+
			
 
				+def process_document(
			
 
				+    input_path: Path,
			
 
				+    output_path: Path,
			
 
				+    threshold: int = 160,
			
 
				+    morph_close_kernel: int = 0,
			
 
				+    dpi: int = 200,
			
 
				+    page_range: Optional[str] = None,
			
 
				+    force_image: bool = False,
			
 
				+) -> int:
			
 
				+    """
			
 
				+    统一处理函数：支持 PDF（扫描件）和图片，去除水印后保存。
			
 
				+
			
 
				+    使用 PDFUtils.load_and_classify_document 加载并分类：
			
 
				+    - 文字型 PDF（pdf_type='txt'）：优先尝试原生 XObject 水印去除（保留可搜索性）；
			
 
				+      失败时自动回退图像化处理，或 force_image=True 时直接走图像处理
			
 
				+    - 扫描件 PDF（pdf_type='ocr'）：逐页去水印后重新打包为 PDF
			
 
				+    - 图片：检测水印后去除并保存
			
 
				+
			
 
				+    Args:
			
 
				+        input_path: 输入文件路径（PDF 或图片）
			
 
				+        output_path: 输出文件路径
			
 
				+        threshold: 灰度阈值（140-180），越大保守，越小激进
			
 
				+        morph_close_kernel: 形态学闭运算核大小，0 跳过
			
 
				+        dpi: PDF 渲染分辨率
			
 
				+        page_range: 页面范围字符串，如 "1-5,7,9-12"（从 1 开始，仅对 PDF 有效）
			
 
				+        force_image: 强制对文字型 PDF 使用图像化处理（会失去文字可搜索性，
			
 
				+                     但能处理水印嵌在内容流中的情况）
			
 
				+
			
 
				+    Returns:
			
 
				+        实际处理的页/图片数
			
 
				+    """
			
 
				+    import shutil
			
 
				+    import numpy as np
			
 
				+    from io import BytesIO
			
 
				+    from PIL import Image
			
 
				+    from ocr_utils.pdf_utils import PDFUtils
			
 
				+
			
 
				+    is_pdf = input_path.suffix.lower() == ".pdf"
			
 
				+
			
 
				+    # 统一加载 + 分类（PDF 用 MinerU pdf_classify，图片直接读取）
			
 
				+    images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
			
 
				+        input_path, dpi=dpi, page_range=page_range
			
 
				+    )
			
 
				+
			
 
				+    # _known_has_wm: 当 txt 分支已确认有水印时设为 True，避免公共段用更严格阈值误判
			
 
				+    _known_has_wm: Optional[bool] = None
			
 
				+
			
 
				+    # 文字型 PDF：优先尝试原生 XObject 水印去除，保留可搜索性
			
 
				+    if is_pdf and pdf_type == "txt" and not force_image:
			
 
				+        output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+        removed = _try_remove_txt_pdf_watermark(input_path, output_path)
			
 
				+        if removed > 0:
			
 
				+            logger.info(
			
 
				+                f"✅ 文字型 PDF '{input_path.name}'：删除 {removed} 个水印 XObject，"
			
 
				+                "保留文字可搜索性，已保存。"
			
 
				+            )
			
 
				+            return removed
			
 
				+
			
 
				+        # XObject 扫描无结果，用较低阈值（0.5%）做图像水印检测二次确认
			
 
				+        # 文字 PDF 背景干净，降低阈值以检测稀疏文字水印
			
 
				+        first_np = np.array(images[0]["img_pil"])
			
 
				+        if detect_watermark(first_np, ratio_threshold=0.005):
			
 
				+            logger.warning(
			
 
				+                f"⚠️  文字型 PDF '{input_path.name}'：未找到 XObject 水印，"
			
 
				+                "但图像检测发现水印（内联内容流水印），"
			
 
				+                "回退为图像化处理（输出将失去文字可搜索性）。"
			
 
				+            )
			
 
				+            _known_has_wm = True  # 明确检测到水印，跳过公共段二次检测
			
 
				+        else:
			
 
				+            logger.info(
			
 
				+                f"✅ 文字型 PDF '{input_path.name}'：未检测到水印，直接复制。"
			
 
				+            )
			
 
				+            shutil.copy2(str(input_path), str(output_path))
			
 
				+            return 0
			
 
				+    elif is_pdf and pdf_type == "txt" and force_image:
			
 
				+        logger.warning(
			
 
				+            f"⚠️  文字型 PDF '{input_path.name}'：--force-image 模式，"
			
 
				+            "强制图像化处理（输出将失去文字可搜索性）。"
			
 
				+        )
			
 
				+        _known_has_wm = True  # force_image 模式不再检测，直接去除
			
 
				+
			
 
				+    logger.info(
			
 
				+        f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name}  "
			
 
				+        f"共 {len(images)} {'页' if is_pdf else '张'}  threshold={threshold}"
			
 
				+    )
			
 
				+
			
 
				+    # 水印检测（仅用第一页/图判断，同一文档水印通常一致）
			
 
				+    # _known_has_wm 已在 txt 分支设置时，跳过重复检测
			
 
				+    if _known_has_wm is not None:
			
 
				+        has_wm = _known_has_wm
			
 
				+        logger.info("🔍 检测到水印，启动去水印处理" if has_wm else "✅ 未检测到水印，跳过")
			
 
				+    else:
			
 
				+        first_np = np.array(images[0]["img_pil"])
			
 
				+        # 扫描件/图片路径：使用宽松一档的中间调阈值（2.5%）以避免边界误判，
			
 
				+        # 斜向直线验证仍作为双重保险防止误报
			
 
				+        has_wm = detect_watermark(first_np, ratio_threshold=0.025)
			
 
				+        if has_wm:
			
 
				+            logger.info("🔍 检测到水印，启动去水印处理")
			
 
				+        else:
			
 
				+            logger.info("✅ 未检测到水印，跳过去水印处理")
			
 
				+            if not is_pdf:
			
 
				+                # 图片无水印：直接复制
			
 
				+                output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+                shutil.copy2(str(input_path), str(output_path))
			
 
				+                return 1
			
 
				+
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    if is_pdf:
			
 
				+        # 逐页处理后重新打包为 PDF
			
 
				+        try:
			
 
				+            import fitz
			
 
				+        except ImportError:
			
 
				+            raise ImportError("请安装 PyMuPDF: pip install PyMuPDF")
			
 
				+
			
 
				+        new_doc = fitz.open()
			
 
				+        for i, img_dict in enumerate(images):
			
 
				+            pil_img = img_dict["img_pil"]
			
 
				+            img_np = np.array(pil_img)
			
 
				+
			
 
				+            if has_wm:
			
 
				+                cleaned_gray = remove_watermark_from_image(
			
 
				+                    img_np, threshold=threshold,
			
 
				+                    morph_close_kernel=morph_close_kernel, return_pil=False,
			
 
				+                )
			
 
				+                out_pil = Image.fromarray(cleaned_gray).convert("RGB")
			
 
				+            else:
			
 
				+                out_pil = pil_img
			
 
				+
			
 
				+            buf = BytesIO()
			
 
				+            out_pil.save(buf, format="PNG", optimize=False)
			
 
				+            buf.seek(0)
			
 
				+
			
 
				+            # 按渲染图尺寸创建新页面（保持原始 DPI 尺寸）
			
 
				+            w_px, h_px = out_pil.size
			
 
				+            new_page = new_doc.new_page(width=w_px * 72 / dpi, height=h_px * 72 / dpi)
			
 
				+            new_page.insert_image(new_page.rect, stream=buf.read())
			
 
				+
			
 
				+            if (i + 1) % 10 == 0 or i == len(images) - 1:
			
 
				+                logger.info(f"  进度: {i + 1}/{len(images)}")
			
 
				+
			
 
				+        new_doc.save(str(output_path), garbage=4, deflate=True)
			
 
				+    else:
			
 
				+        # 图片：有水印则去除后保存
			
 
				+        img_np = np.array(images[0]["img_pil"])
			
 
				+        cleaned_gray = remove_watermark_from_image(
			
 
				+            img_np, threshold=threshold,
			
 
				+            morph_close_kernel=morph_close_kernel, return_pil=False,
			
 
				+        )
			
 
				+        Image.fromarray(cleaned_gray, mode="L").save(str(output_path))
			
 
				+
			
 
				+    logger.info(f"✅ 保存到: {output_path}")
			
 
				+    return len(images)
			
 
				+
			
 
				+
			
 
				+def preview_page(
			
 
				+    input_path: Path,
			
 
				+    page_idx: int = 0,
			
 
				+    threshold: int = 160,
			
 
				+    dpi: int = 200,
			
 
				+):
			
 
				+    """展示单页原图与去水印对比（需要 matplotlib）。支持 PDF 和图片文件。"""
			
 
				+    try:
			
 
				+        import numpy as np
			
 
				+        import matplotlib.pyplot as plt
			
 
				+        import matplotlib
			
 
				+        matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
			
 
				+        matplotlib.rcParams['axes.unicode_minus'] = False
			
 
				+    except ImportError as e:
			
 
				+        raise ImportError(f"预览需要 matplotlib: {e}")
			
 
				+
			
 
				+    suffix = input_path.suffix.lower()
			
 
				+
			
 
				+    if suffix == ".pdf":
			
 
				+        try:
			
 
				+            import fitz
			
 
				+        except ImportError:
			
 
				+            raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF")
			
 
				+        doc = fitz.open(str(input_path))
			
 
				+        if page_idx >= len(doc):
			
 
				+            raise ValueError(f"页码 {page_idx} 超出范围（共 {len(doc)} 页）")
			
 
				+        mat = fitz.Matrix(dpi / 72, dpi / 72)
			
 
				+        page = doc[page_idx]
			
 
				+        pix = page.get_pixmap(matrix=mat, alpha=False)
			
 
				+        img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
			
 
				+        title_orig = f"原图  第 {page_idx + 1} 页"
			
 
				+    elif suffix in IMAGE_SUFFIXES:
			
 
				+        from PIL import Image
			
 
				+        img_np = np.array(Image.open(str(input_path)).convert("RGB"))
			
 
				+        title_orig = f"原图  {input_path.name}"
			
 
				+    else:
			
 
				+        raise ValueError(f"不支持的文件格式: {suffix}")
			
 
				+
			
 
				+    cleaned = remove_watermark_from_image(img_np, threshold=threshold, return_pil=False)
			
 
				+
			
 
				+    fig, axes = plt.subplots(1, 2, figsize=(20, 14))
			
 
				+    axes[0].imshow(img_np)
			
 
				+    axes[0].set_title(title_orig, fontsize=14)
			
 
				+    axes[0].axis('off')
			
 
				+
			
 
				+    axes[1].imshow(cleaned, cmap='gray')
			
 
				+    axes[1].set_title(f"去水印后  threshold={threshold}", fontsize=14)
			
 
				+    axes[1].axis('off')
			
 
				+
			
 
				+    plt.tight_layout()
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description="银行流水水印去除工具",
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog=__doc__,
			
 
				+    )
			
 
				+    parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录（批量模式）")
			
 
				+    parser.add_argument("-o", "--output", type=Path, default=None,
			
 
				+                        help="输出路径（单文件模式；默认在原文件名后加 _cleaned）")
			
 
				+    parser.add_argument("--threshold", type=int, default=160,
			
 
				+                        help="灰度阈值 (140-180)，默认 160")
			
 
				+    parser.add_argument("--morph-kernel", type=int, default=2,
			
 
				+                        help="形态学闭运算核大小，0 跳过，默认 2")
			
 
				+    parser.add_argument("--dpi", type=int, default=200,
			
 
				+                        help="渲染 DPI，默认 200")
			
 
				+    parser.add_argument("--batch", action="store_true",
			
 
				+                        help="批量模式：处理目录下所有 PDF 和图片")
			
 
				+    parser.add_argument("--preview", action="store_true",
			
 
				+                        help="预览模式：展示单页对比图（不保存）")
			
 
				+    parser.add_argument("--page", type=int, default=0,
			
 
				+                        help="预览页码（0-based），默认第 0 页")
			
 
				+    parser.add_argument("--page-range", type=str, default=None,
			
 
				+                        help="处理页面范围，如 '1-3,5,7-9'（从 1 开始，仅对 PDF 有效）")
			
 
				+    parser.add_argument("--force-image", action="store_true",
			
 
				+                        help="强制对文字型 PDF 使用图像化处理（会失去可搜索性，适用于 XObject 方法无法去除的内联水印）")
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    if args.preview:
			
 
				+        preview_page(
			
 
				+            args.input,
			
 
				+            page_idx=args.page,
			
 
				+            threshold=args.threshold,
			
 
				+            dpi=args.dpi,
			
 
				+        )
			
 
				+        return
			
 
				+
			
 
				+    if args.batch:
			
 
				+        # 批量模式：处理目录下所有 PDF 和图片
			
 
				+        input_dir = args.input
			
 
				+        if not input_dir.is_dir():
			
 
				+            logger.error(f"批量模式需要传入目录: {input_dir}")
			
 
				+            sys.exit(1)
			
 
				+
			
 
				+        # 收集所有支持的文件
			
 
				+        all_files: list[Path] = sorted(input_dir.glob("*.pdf"))
			
 
				+        for ext in IMAGE_SUFFIXES:
			
 
				+            all_files.extend(sorted(input_dir.glob(f"*{ext}")))
			
 
				+            all_files.extend(sorted(input_dir.glob(f"*{ext.upper()}")))
			
 
				+        all_files = sorted(set(all_files))
			
 
				+
			
 
				+        if not all_files:
			
 
				+            logger.warning(f"目录中没有可处理的文件（PDF/图片）: {input_dir}")
			
 
				+            return
			
 
				+        out_dir = args.output or input_dir / "cleaned"
			
 
				+        out_dir.mkdir(parents=True, exist_ok=True)
			
 
				+        for file in all_files:
			
 
				+            out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
			
 
				+            try:
			
 
				+                process_document(file, out_file, args.threshold, args.morph_kernel, args.dpi, args.page_range, args.force_image)
			
 
				+            except Exception as e:
			
 
				+                logger.error(f"❌ 处理失败 {file.name}: {e}")
			
 
				+        logger.info(f"✅ 批量处理完成，共 {len(all_files)} 个文件 -> {out_dir}")
			
 
				+    else:
			
 
				+        # 单文件模式
			
 
				+        input_path = args.input
			
 
				+        if not input_path.is_file():
			
 
				+            logger.error(f"文件不存在: {input_path}")
			
 
				+            sys.exit(1)
			
 
				+        output_path = args.output or input_path.with_name(
			
 
				+            f"{input_path.stem}_cleaned{input_path.suffix}"
			
 
				+        )
			
 
				+        suffix = input_path.suffix.lower()
			
 
				+        if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
			
 
				+            process_document(input_path, output_path, args.threshold, args.morph_kernel, args.dpi, args.page_range, args.force_image)
			
 
				+        else:
			
 
				+            logger.error(f"不支持的文件格式: {suffix}，支持 PDF 和 {IMAGE_SUFFIXES}")
			
 
				+            sys.exit(1)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    if len(sys.argv) == 1:
			
 
				+        print("ℹ️  未提供命令行参数，使用默认配置运行...")
			
 
				+
			
 
				+        # 默认配置（用于开发测试）
			
 
				+        default_config = {
			
 
				+            # 测试输入
			
 
				+            # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
			
 
				+            # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",
			
 
				+            
			
 
				+            # 文字PDF测试
			
 
				+            # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
			
 
				+            # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
			
 
				+            "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台（ocr_platform）-交易流水识别，财报识别.pdf",
			
 
				+            # "output": "./output/杨万益_福建农信",
			
 
				+            # 页面范围（可选，支持 "1-5,7" 语法，仅对 PDF 有效）
			
 
				+            # "page_range": "3",  # 仅处理第 1 页（对应 --page-range 参数）
			
 
				+            "dpi": 200,
			
 
				+            "threshold": 160,
			
 
				+            "morph_kernel": 0,  # 遮罩替换模式下不需要闭运算
			
 
				+            # "preview": True,
			
 
				+        }
			
 
				+
			
 
				+        # 构造参数（注意 input 是位置参数，morph_kernel 对应 --morph-kernel）
			
 
				+        sys.argv = [sys.argv[0], default_config["input"]]
			
 
				+        skip_keys = {"input"}
			
 
				+        for key, value in default_config.items():
			
 
				+            if key in skip_keys:
			
 
				+                continue
			
 
				+            # 将下划线转换为连字符（如 morph_kernel -> morph-kernel）
			
 
				+            flag = f"--{key.replace('_', '-')}"
			
 
				+            if isinstance(value, bool):
			
 
				+                if value:
			
 
				+                    sys.argv.append(flag)
			
 
				+            else:
			
 
				+                sys.argv.extend([flag, str(value)])
			
 
				+
			
 
				+    sys.exit(main())
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_glm_vl.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_glm_vl.yaml
@@ -5,6 +5,9 @@ description: "银行交易流水、对账单等场景（使用 GLM-OCR 进行 VL
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200  # PDF转图片的DPI
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
@@ -14,6 +17,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大保守（残留水印），值越小激进（损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），默认的 morph_kernel 改为 0（非二值图像时形态学闭运算会适得其反）
			
 
				 
			
 
				 # ============================================================
			
 
				 # Layout 检测配置 - 使用 PP-DocLayoutV3
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl.yaml
@@ -5,6 +5,9 @@ description: "银行交易流水、对账单等场景"
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200  # PDF转图片的DPI
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
@@ -14,6 +17,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大保守（残留水印），值越小激进（损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），默认的 morph_kernel 改为 0（非二值图像时形态学闭运算会适得其反）
			
 
				 
			
 
				 layout_detection:
			
 
				   # MinerU-VL layout（通过 VLM 服务做版式检测）
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_paddle_vl.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_paddle_vl.yaml
@@ -5,6 +5,9 @@ description: "银行交易流水、对账单等场景"
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200  # PDF转图片的DPI
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
@@ -14,6 +17,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大保守（残留水印），值越小激进（损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），默认的 morph_kernel 改为 0（非二值图像时形态学闭运算会适得其反）
			
 
				 
			
 
				 layout_detection:
			
 
				   # module: "paddle"
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_paddle_vl_local.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_paddle_vl_local.yaml
@@ -0,0 +1,211 @@
 
				+# 银行交易流水场景配置 - V4版本
			
 
				+# Pipeline V3逻辑: 有线表格使用MinerU UNet, 无线表格/seal使用PaddleOCR-VL-1.5-GGUF VLM（本地 llama-server）
			
 
				+# llama-server -hf PaddlePaddle/PaddleOCR-VL-1.5-GGUF
			
 
				+scene_name: "bank_statement_yusys_local"
			
 
				+
			
 
				+description: "银行流水V4: PP-DocLayoutV3 layout + PaddleOCR + MinerU UNet（有线表格）+ PaddleOCR-VL-1.5-GGUF VLM（无线表格/seal）"
			
 
				+
			
 
				+input:
			
 
				+  supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				+  dpi: 200
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				+
			
 
				+preprocessor:
			
 
				+  module: "mineru"
			
 
				+  orientation_classifier:
			
 
				+    enabled: true
			
 
				+    model_name: "paddle_orientation_classification"
			
 
				+    model_dir: null  # 使用默认路径
			
 
				+  unwarping:
			
 
				+    enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大保守（残留水印），值越小激进（损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），默认的 morph_kernel 改为 0（非二值图像时形态学闭运算会适得其反）
			
 
				+
			
 
				+# ============================================================
			
 
				+# Layout 检测配置 - 智能路由器（按场景直接选择模型）
			
 
				+# ============================================================
			
 
				+layout_detection:
			
 
				+  module: "smart_router"
			
 
				+  strategy: "scene"  # 按场景直接选择模型，不走ocr_eval
			
 
				+
			
 
				+  # 场景策略：指定场景直接选用的布局模型
			
 
				+  scene_strategy:
			
 
				+    bank_statement:
			
 
				+      model: "docling"
			
 
				+    financial_report:
			
 
				+      model: "paddle_ppdoclayoutv3"
			
 
				+  default_model: "docling"
			
 
				+
			
 
				+  # 配置多个模型
			
 
				+  models:
			
 
				+    docling:
			
 
				+      module: "docling"
			
 
				+      model_name: "docling-layout-old"
			
 
				+      model_dir: "ds4sd/docling-layout-old"
			
 
				+      device: "cpu"
			
 
				+      conf: 0.3
			
 
				+      num_threads: 4
			
 
				+
			
 
				+    paddle_ppdoclayoutv3:
			
 
				+      module: "paddle"
			
 
				+      model_name: "PP-DocLayoutV3"
			
 
				+      model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
			
 
				+      device: "cpu"
			
 
				+      conf: 0.3
			
 
				+      num_threads: 4
			
 
				+      batch_size: 1
			
 
				+  
			
 
				+  # 后处理配置
			
 
				+  post_process:
			
 
				+    # 将大面积文本块转换为表格（后处理）
			
 
				+    convert_large_text_to_table: true  # 是否启用
			
 
				+    min_text_area_ratio: 0.25         # 最小面积占比（25%）
			
 
				+    min_text_width_ratio: 0.4         # 最小宽度占比（40%）
			
 
				+    min_text_height_ratio: 0.3        # 最小高度占比（30%）
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: false              # 由命令行 --debug 统一控制，勿在此 hardcode true
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码）
			
 
				+
			
 
				+# ============================================================
			
 
				+# OCR 识别配置
			
 
				+# ============================================================
			
 
				+ocr_recognition:
			
 
				+  module: "mineru"
			
 
				+  language: "ch"
			
 
				+  det_threshold: 0.5
			
 
				+  unclip_ratio: 1.5
			
 
				+  enable_merge_det_boxes: false
			
 
				+  batch_size: 8
			
 
				+  device: "cpu"
			
 
				+
			
 
				+# ============================================================
			
 
				+# 表格分类配置（自动区分有线/无线表格）
			
 
				+# ============================================================
			
 
				+table_classification:
			
 
				+  enabled: true               # 启用自动表格分类
			
 
				+  module: "paddle"            # 分类模型：paddle（MinerU PaddleTableClsModel）
			
 
				+  confidence_threshold: 0.5   # 分类置信度阈值
			
 
				+  batch_size: 16              # 批处理大小
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: false              # 由命令行 --debug 统一控制，勿在此 hardcode true
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    save_table_lines: true       # 保存表格线可视化（unet横线/竖线叠加）
			
 
				+    image_format: "png"          # 可视化图片格式：png/jpg
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码/表格序号）
			
 
				+
			
 
				+# ============================================================
			
 
				+# 有线表格识别专用配置（MinerU UNet）
			
 
				+# ============================================================
			
 
				+table_recognition_wired:
			
 
				+  use_wired_unet: true
			
 
				+  upscale_ratio: 3.333
			
 
				+  need_ocr: true
			
 
				+  row_threshold: 10
			
 
				+  col_threshold: 15
			
 
				+  ocr_conf_threshold: 0.9       # 单元格 OCR 置信度阈值
			
 
				+  cell_crop_margin: 2
			
 
				+  use_custom_postprocess: true  # 是否使用自定义后处理（默认启用）
			
 
				+
			
 
				+  # 是否启用倾斜矫正
			
 
				+  enable_deskew: true
			
 
				+
			
 
				+  # 🆕 启用多源单元格融合
			
 
				+  use_cell_fusion: true
			
 
				+  
			
 
				+  # 融合引擎配置
			
 
				+  cell_fusion:
			
 
				+    # RT-DETR 模型路径（必需）
			
 
				+    rtdetr_model_path: "/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx"
			
 
				+    
			
 
				+    # 融合权重
			
 
				+    unet_weight: 0.6        # UNet 权重（结构性强）
			
 
				+    rtdetr_weight: 0.4      # RT-DETR 权重（鲁棒性强）
			
 
				+    
			
 
				+    # 阈值配置
			
 
				+    iou_merge_threshold: 0.7    # 高IoU合并阈值（>0.7则加权平均）
			
 
				+    iou_nms_threshold: 0.5      # NMS去重阈值
			
 
				+    rtdetr_conf_threshold: 0.5  # RT-DETR置信度阈值
			
 
				+    
			
 
				+    # 功能开关
			
 
				+    enable_ocr_compensation: true      # 启用OCR边缘补偿
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: false              # 由命令行 --debug 统一控制，勿在此 hardcode true
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    save_table_lines: true       # 保存表格线可视化（unet横线/竖线叠加）
			
 
				+    save_connected_components: true  # 保存连通域提取的单元格图
			
 
				+    save_grid_structure: true    # 保存逻辑网格结构（row/col/rowspan/colspan）
			
 
				+    save_text_overlay: true      # 保存文本填充覆盖图
			
 
				+    image_format: "png"          # 可视化图片格式：png/jpg
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码/表格序号）
			
 
				+
			
 
				+# ============================================================
			
 
				+# VL识别配置 - 使用 PaddleOCR-VL-1.5-GGUF（本地 llama-server，端口 8102；无线表格 + seal识别）
			
 
				+# ============================================================
			
 
				+vl_recognition:
			
 
				+  module: "glmocr"
			
 
				+  api_url: "http://localhost:8102/v1/chat/completions"
			
 
				+  api_key: null  # 可选，如需要可填写
			
 
				+  model: "glm-ocr"
			
 
				+  max_image_size: 3500  # GLM-OCR 推荐的最大图片尺寸
			
 
				+  resize_mode: 'max'    # 缩放模式: 'max' 保持宽高比, 'fixed' 固定尺寸
			
 
				+  verify_ssl: false
			
 
				+  
			
 
				+  # Task prompt mapping - 针对不同任务使用不同提示词
			
 
				+  task_prompt_mapping:
			
 
				+    text: "Text Recognition:"
			
 
				+    table: "Table Recognition:"
			
 
				+    formula: "Formula Recognition:"
			
 
				+    seal: "Seal Recognition:"  # 印章识别的专用提示词
			
 
				+  
			
 
				+  # 模型参数
			
 
				+  model_params:
			
 
				+    connection_pool_size: 128  # HTTP 连接池大小（应 >= max_workers）
			
 
				+    http_timeout: 300          # HTTP 请求超时时间（秒）
			
 
				+    connect_timeout: 30        # 连接超时时间（秒）
			
 
				+    retry_max_attempts: 2      # 最大重试次数
			
 
				+    retry_backoff_base_seconds: 0.5
			
 
				+    retry_backoff_max_seconds: 8.0
			
 
				+    retry_jitter_ratio: 0.2
			
 
				+    retry_status_codes: [429, 500, 502, 503, 504]
			
 
				+    max_tokens: 16384
			
 
				+    temperature: 0.1
			
 
				+    top_p: 0.0001
			
 
				+    top_k: 1
			
 
				+    repetition_penalty: 1.1
			
 
				+  
			
 
				+  # 场景特定配置
			
 
				+  table_recognition:
			
 
				+
			
 
				+# ============================================================
			
 
				+# 输出配置
			
 
				+# ============================================================
			
 
				+output:
			
 
				+  create_subdir: false
			
 
				+  save_pdf_images: true
			
 
				+  save_json: true
			
 
				+  save_page_json: true
			
 
				+  save_markdown: true
			
 
				+  save_page_markdown: true
			
 
				+  save_html: true
			
 
				+  save_layout_image: true
			
 
				+  save_ocr_image: true
			
 
				+  draw_type_label: true
			
 
				+  draw_bbox_number: true
			
 
				+  save_enhanced_json: true
			
 
				+  normalize_numbers: true
			
 
				+  debug_mode: false
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_smart_router.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_smart_router.yaml
@@ -7,11 +7,22 @@ description: "银行交易流水：智能路由器自动选择最佳layout模型
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
 
				   orientation_classifier:
			
 
				     enabled: true
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大越保守（可能残留水印），值越小越激进（可能损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），morph_close_kernel 的默认值改为 0（对非二值图像做形态学闭运算会适得其反）
			
 
				 
			
 
				 # ============================================================
			
 
				 # 智能布局模型路由器配置
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_yusys_local.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_yusys_local.yaml
@@ -8,6 +8,9 @@ description: "银行流水V4: PP-DocLayoutV3 layout + PaddleOCR + MinerU UNet（
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
@@ -17,6 +20,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大越保守（可能残留水印），值越小越激进（可能损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），morph_close_kernel 的默认值改为 0（对非二值图像做形态学闭运算会适得其反）
			
 
				 
			
 
				 # ============================================================
			
 
				 # Layout 检测配置 - 智能路由器（按场景直接选择模型）
			
@@ -147,7 +158,7 @@ table_recognition_wired:
 
				 # ============================================================
			
 
				 vl_recognition:
			
 
				   module: "glmocr"
			
 
				-  api_url: "http://localhost:8080/v1/chat/completions"
			
 
				+  api_url: "http://localhost:8101/v1/chat/completions"
			
 
				   api_key: null  # 可选，如需要可填写
			
 
				   model: "glm-ocr"
			
 
				   max_image_size: 3500  # GLM-OCR 推荐的最大图片尺寸
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml
@@ -10,6 +10,9 @@ description: "银行交易流水、对账单等场景 - 增强版"
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200  # PDF转图片的DPI
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 # ============================================================
			
 
				 # 预处理配置（方向识别）
			
@@ -22,6 +25,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false  # 图像矫正（可选）
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大越保守（可能残留水印），值越小越激进（可能损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），morph_close_kernel 的默认值改为 0（对非二值图像做形态学闭运算会适得其反）
			
 
				 
			
 
				 # ============================================================
			
 
				 # 版式检测配置
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v3.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v3.yaml
@@ -6,11 +6,22 @@ description: "银行流水：docling layout + PaddleOCR + MinerU UNet（有线
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
 
				   orientation_classifier:
			
 
				     enabled: true
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大越保守（可能残留水印），值越小越激进（可能损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），morph_close_kernel 的默认值改为 0（对非二值图像做形态学闭运算会适得其反）
			
 
				 
			
 
				 layout_detection:
			
 
				   module: "docling"
			
--- a/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml
@@ -7,6 +7,9 @@ description: "银行流水V4: PP-DocLayoutV3 layout + PaddleOCR + MinerU UNet（
 
				 input:
			
 
				   supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				   dpi: 200
			
 
				+  txt_pdf_watermark_removal:
			
 
				+    enabled: true   # 文字型PDF渲染前去除水印XObject（保留文字可搜索性）
			
 
				+    sample_pages: 3  # 扫描前N页快速预检
			
 
				 
			
 
				 preprocessor:
			
 
				   module: "mineru"
			
@@ -16,6 +19,14 @@ preprocessor:
 
				     model_dir: null  # 使用默认路径
			
 
				   unwarping:
			
 
				     enabled: false
			
 
				+  # -------------------------------------------------------
			
 
				+  # 水印去除配置（适用于银行流水浅色斜向文字水印）
			
 
				+  # -------------------------------------------------------
			
 
				+  watermark_removal:
			
 
				+    enabled: true           # 是否启用水印去除
			
 
				+    threshold: 160          # 灰度阈值（140-180）：高于此值视为水印变白
			
 
				+                            # 值越大越保守（可能残留水印），值越小越激进（可能损失浅色正文）
			
 
				+    morph_close_kernel: 0   # 形态学闭运算核大小（像素），morph_close_kernel 的默认值改为 0（对非二值图像做形态学闭运算会适得其反）
			
 
				 
			
 
				 # ============================================================
			
 
				 # Layout 检测配置 - 智能路由器（按场景直接选择模型）
			
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
@@ -50,6 +50,8 @@ except ImportError:
 
				     from layout_utils import LayoutUtils, SpanMatcher
			
 
				     from element_processors import ElementProcessors
			
 
				 
			
 
				+from ocr_utils.watermark_utils import scan_pdf_watermark_xobjs, remove_txt_pdf_watermark
			
 
				+
			
 
				 # 从 ocr_tools.ocr_merger 导入 merger 组件
			
 
				 try:
			
 
				     from ocr_tools.ocr_merger import TableCellMatcher, TextMatcher
			
@@ -286,8 +288,23 @@ class EnhancedDocPipeline:
 
				         try:
			
 
				             # 1. 加载文档并分类
			
 
				             dpi = self.config.get('input', {}).get('dpi', 200)
			
 
				+
			
 
				+            # 1a. 文字型 PDF 水印预处理（在渲染前去除，保留文字可搜索性）
			
 
				+            _pdf_bytes_override: Optional[bytes] = None
			
 
				+            if is_pdf:
			
 
				+                wm_cfg = self.config.get('input', {}).get('txt_pdf_watermark_removal', {})
			
 
				+                if wm_cfg.get('enabled', False):
			
 
				+                    _raw = doc_path.read_bytes()
			
 
				+                    if scan_pdf_watermark_xobjs(_raw, sample_pages=wm_cfg.get('sample_pages', 3)):
			
 
				+                        _cleaned = remove_txt_pdf_watermark(_raw)
			
 
				+                        if _cleaned is not None:
			
 
				+                            _pdf_bytes_override = _cleaned
			
 
				+                            logger.info(f"🧹 文字型 PDF 原生去水印完成（{doc_path.name}）")
			
 
				+                        else:
			
 
				+                            logger.debug(f"ℹ️ txt PDF 水印扫描命中但去除返回 None，跳过（{doc_path.name}）")
			
 
				+
			
 
				             images, pdf_type, pdf_doc, renderer_used = PDFUtils.load_and_classify_document(
			
 
				-                doc_path, dpi=dpi, page_range=page_range
			
 
				+                doc_path, dpi=dpi, page_range=page_range, pdf_bytes=_pdf_bytes_override
			
 
				             )
			
 
				             results['metadata']['pdf_type'] = pdf_type
			
 
				             results['metadata']['page_count'] = len(images)
			
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py
@@ -29,6 +29,7 @@ if str(module_root) not in sys.path:
 
				 # 导入基础类（复用现有实现）
			
 
				 from .pipeline_manager_v2 import EnhancedDocPipeline
			
 
				 from ocr_utils import PDFUtils
			
 
				+from ocr_utils.watermark_utils import scan_pdf_watermark_xobjs, remove_txt_pdf_watermark
			
 
				 
			
 
				 # 从 ocr_utils 导入输出格式化器
			
 
				 try:
			
@@ -125,8 +126,23 @@ class StreamingDocPipeline(EnhancedDocPipeline):
 
				         try:
			
 
				             # 1. 加载文档并分类
			
 
				             dpi = self.config.get('input', {}).get('dpi', 200)
			
 
				+
			
 
				+            # 1a. 文字型 PDF 水印预处理（在渲染前去除，保留文字可搜索性）
			
 
				+            _pdf_bytes_override: Optional[bytes] = None
			
 
				+            if is_pdf:
			
 
				+                wm_cfg = self.config.get('input', {}).get('txt_pdf_watermark_removal', {})
			
 
				+                if wm_cfg.get('enabled', False):
			
 
				+                    _raw = doc_path.read_bytes()
			
 
				+                    if scan_pdf_watermark_xobjs(_raw, sample_pages=wm_cfg.get('sample_pages', 3)):
			
 
				+                        _cleaned = remove_txt_pdf_watermark(_raw)
			
 
				+                        if _cleaned is not None:
			
 
				+                            _pdf_bytes_override = _cleaned
			
 
				+                            logger.info(f"🧹 文字型 PDF 原生去水印完成（{doc_path.name}）")
			
 
				+                        else:
			
 
				+                            logger.debug(f"ℹ️ txt PDF 水印扫描命中但去除返回 None，跳过（{doc_path.name}）")
			
 
				+
			
 
				             images, pdf_type, pdf_doc, renderer_used = PDFUtils.load_and_classify_document(
			
 
				-                doc_path, dpi=dpi, page_range=page_range
			
 
				+                doc_path, dpi=dpi, page_range=page_range, pdf_bytes=_pdf_bytes_override
			
 
				             )
			
 
				             
			
 
				             results_summary['metadata']['pdf_type'] = pdf_type
			
--- a/ocr_tools/universal_doc_parser/main_v2.py
+++ b/ocr_tools/universal_doc_parser/main_v2.py
@@ -570,8 +570,8 @@ if __name__ == "__main__":
 
				             # "input": "/Users/zhch158/workspace/data/流水分析/湛_平安银行图.pdf",
			
 
				             # "output_dir": "./output/湛_平安银行图/bank_statement_yusys_v3",
			
 
				 
			
 
				-            "input": "/Users/zhch158/workspace/data/流水分析/张_微信图.pdf",
			
 
				-            "output_dir": "./output/张_微信图/bank_statement_yusys_v4",
			
 
				+            # "input": "/Users/zhch158/workspace/data/流水分析/张_微信图.pdf",
			
 
				+            # "output_dir": "./output/张_微信图/bank_statement_yusys_v4",
			
 
				 
			
 
				             # "input": "/Users/zhch158/workspace/data/流水分析/许_民生银行图.pdf",
			
 
				             # "output_dir": "./output/许_民生银行图/bank_statement_yusys_v3",
			
@@ -628,8 +628,23 @@ if __name__ == "__main__":
 
				             # "input": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司.pdf",
			
 
				             # "output_dir": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司/bank_statement_yusys_v3",
			
 
				 
			
 
				+            # "input": "/Users/zhch158/workspace/data/OCBC/数据迁移_20260316173209_180_7.jpg",
			
 
				+            # "input": "/Users/zhch158/workspace/data/OCBC/微信图片_20260316173209_180_7.jpg",
			
 
				+            # "output_dir": "/Users/zhch158/workspace/data/OCBC/bank_statement_yusys_v4",
			
 
				+
			
 
				+            # "input": "/Users/zhch158/workspace/data/流水分析/韩_中国银行图.pdf",
			
 
				+            # "output_dir": "/Users/zhch158/workspace/data/流水分析/韩_中国银行图/bank_statement_yusys_v4",
			
 
				+
			
 
				+            "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
			
 
				+            "output_dir": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信/bank_statement_yusys_local",
			
 
				+
			
 
				+            # 日志文件
			
 
				+            "log_file": "./output/logs/bank_statement_yusys_local/process.log",
			
 
				+
			
 
				             # 配置文件
			
 
				-            "config": "./config/bank_statement_yusys_v4.yaml",
			
 
				+            "config": "./config/bank_statement_yusys_local.yaml",
			
 
				+            # "config": "./config/bank_statement_paddle_vl_local.yaml",
			
 
				+            # "config": "./config/bank_statement_yusys_v4.yaml",
			
 
				             # "config": "./config/bank_statement_yusys_v3.yaml",
			
 
				             # "config": "./config/bank_statement_smart_router.yaml",
			
 
				             # "config": "./config/bank_statement_mineru_vl.yaml",
			
@@ -641,7 +656,7 @@ if __name__ == "__main__":
 
				             # "scene": "financial_report",
			
 
				             
			
 
				             # 页面范围（可选）
			
 
				-            "pages": "1",  # 只处理前1页
			
 
				+            # "pages": "1",  # 只处理前1页
			
 
				             # "pages": "1-3,5,7-10",  # 处理指定页面
			
 
				             # "pages": "83-109",  # 处理指定页面
			
 
				 
			
@@ -653,8 +668,6 @@ if __name__ == "__main__":
 
				             # 日志级别
			
 
				             "log_level": "DEBUG",
			
 
				 
			
 
				-            # 日志文件
			
 
				-            "log_file": "./output/logs/bank_statement_yusys_v4/process.log",
			
 
				         }
			
 
				         
			
 
				         # 构造参数
			
--- a/ocr_tools/universal_doc_parser/models/adapters/mineru_adapter.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/mineru_adapter.py
@@ -18,6 +18,7 @@ if str(ocr_platform_root) not in sys.path:
 
				 
			
 
				 from .base import BasePreprocessor, BaseLayoutDetector, BaseVLRecognizer, BaseOCRRecognizer
			
 
				 from ocr_utils.coordinate_utils import CoordinateUtils
			
 
				+from ocr_utils.watermark_utils import remove_watermark_from_image_rgb
			
 
				 
			
 
				 # 导入MinerU组件
			
 
				 try:
			
@@ -65,11 +66,28 @@ class MinerUPreprocessor(BasePreprocessor):
 
				 
			
 
				         rotate_angle = 0
			
 
				         processed_image = image
			
 
				-        
			
 
				+
			
 
				+        # 水印去除（在方向校正之前，避免旋转引入额外噪声）
			
 
				+        watermark_cfg = self.config.get('watermark_removal', {})
			
 
				+        if watermark_cfg.get('enabled', False):
			
 
				+            threshold = watermark_cfg.get('threshold', 160)
			
 
				+            morph_close_kernel = watermark_cfg.get('morph_close_kernel', 0)
			
 
				+            try:
			
 
				+                processed_image = remove_watermark_from_image_rgb(
			
 
				+                    processed_image,
			
 
				+                    threshold=threshold,
			
 
				+                    morph_close_kernel=morph_close_kernel,
			
 
				+                    return_pil=False,
			
 
				+                )
			
 
				+                logger.info(f"🧹 Watermark removed (threshold={threshold})")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"⚠️ Watermark removal failed, using original: {e}")
			
 
				+                processed_image = image
			
 
				+
			
 
				         # 方向校正
			
 
				         if self.orientation_classifier is not None:
			
 
				             try:
			
 
				-                rotate_angle = int(self.orientation_classifier.predict(image))
			
 
				+                rotate_angle = int(self.orientation_classifier.predict(processed_image))
			
 
				                 processed_image = self._apply_rotation(processed_image, rotate_angle)
			
 
				                 logger.info(f"📐 Applied rotation: {rotate_angle}")
			
 
				             except Exception as e:
			
--- a/ocr_utils/__init__.py
+++ b/ocr_utils/__init__.py
@@ -56,6 +56,9 @@ from .number_utils import (
 
				     parse_number,
			
 
				     normalize_text_number
			
 
				 )
			
 
				+# PDF 分类工具（封装自 MinerU：classify 始终使用自有实现；内部 helper 优先复用 MinerU 原版，MinerU 不可用时退回内置实现）
			
 
				+# PDFUtils 和 extract_pdf_pages 使用延迟导入，避免在 PaddleX 环境中触发 MinerU 导入检查
			
 
				+# from .pdf_classify import classify as pdf_classify  # 按需 import，避免强依赖
			
 
				 # 坐标工具使用延迟导入，避免循环依赖
			
 
				 # from .coordinate_utils import CoordinateUtils  # 已移除，改为延迟导入
			
 
				 
			
--- a/ocr_utils/image_utils.py
+++ b/ocr_utils/image_utils.py
@@ -7,10 +7,11 @@
 
				 - 图像预处理
			
 
				 - BBox 和点坐标转换
			
 
				 - 图像旋转和坐标转换
			
 
				+- 水印去除
			
 
				 """
			
 
				 import cv2
			
 
				 import numpy as np
			
 
				-from typing import List, Tuple, Union
			
 
				+from typing import List, Tuple, Union, Optional, Dict, Any
			
 
				 from PIL import Image
			
 
				 
			
 
				 
			
@@ -113,6 +114,48 @@ def points_to_bbox(points: np.ndarray) -> List[float]:
 
				     return [x0, y0, x1, y1]
			
 
				 
			
 
				 
			
 
				+def detect_watermark(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    midtone_low: int = 100,
			
 
				+    midtone_high: int = 220,
			
 
				+    ratio_threshold: float = 0.03,
			
 
				+    check_diagonal: bool = True,
			
 
				+    diagonal_angle_range: tuple = (30, 60),
			
 
				+) -> bool:
			
 
				+    """向后兼容别名，实现已迁移至 ocr_utils.watermark_utils.detect_watermark。"""
			
 
				+    from ocr_utils.watermark_utils import detect_watermark as _impl
			
 
				+    return _impl(
			
 
				+        image,
			
 
				+        midtone_low=midtone_low,
			
 
				+        midtone_high=midtone_high,
			
 
				+        ratio_threshold=ratio_threshold,
			
 
				+        check_diagonal=check_diagonal,
			
 
				+        diagonal_angle_range=diagonal_angle_range,
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def remove_watermark_from_image(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    threshold: int = 160,
			
 
				+    morph_close_kernel: int = 2,
			
 
				+    return_pil: Optional[bool] = None,
			
 
				+) -> Union[np.ndarray, Image.Image]:
			
 
				+    """向后兼容别名，实现已迁移至 ocr_utils.watermark_utils.remove_watermark_from_image。"""
			
 
				+    from ocr_utils.watermark_utils import remove_watermark_from_image as _impl
			
 
				+    return _impl(image, threshold=threshold, morph_close_kernel=morph_close_kernel, return_pil=return_pil)
			
 
				+
			
 
				+
			
 
				+def remove_watermark_from_image_rgb(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    threshold: int = 160,
			
 
				+    morph_close_kernel: int = 2,
			
 
				+    return_pil: Optional[bool] = None,
			
 
				+) -> Union[np.ndarray, Image.Image]:
			
 
				+    """向后兼容别名，实现已迁移至 ocr_utils.watermark_utils.remove_watermark_from_image_rgb。"""
			
 
				+    from ocr_utils.watermark_utils import remove_watermark_from_image_rgb as _impl
			
 
				+    return _impl(image, threshold=threshold, morph_close_kernel=morph_close_kernel, return_pil=return_pil)
			
 
				+
			
 
				+
			
 
				 def rotate_image_and_coordinates(
			
 
				     image: Image.Image, 
			
 
				     angle: float, 
			
--- a/ocr_utils/pdf_classify.py
+++ b/ocr_utils/pdf_classify.py
@@ -0,0 +1,198 @@
 
				+"""
			
 
				+PDF 文档类型分类工具
			
 
				+
			
 
				+封装自 MinerU 项目 mineru/utils/pdf_classify.py，作为 ocr_platform 的自有实现。
			
 
				+功能：判断 PDF 是否可直接提取文本（txt）或需要 OCR（ocr）。
			
 
				+
			
 
				+对外接口：
			
 
				+    classify(pdf_bytes: bytes) -> str   # 'txt' 或 'ocr'
			
 
				+
			
 
				+说明：
			
 
				+    classify() 始终使用本模块的自有实现，以保留对 MinerU 原版的定制修改
			
 
				+    （例如 avg_chars >= chars_threshold*4 时跳过图像覆盖率检测，避免含全页水印
			
 
				+    图的文字型 PDF 被误判为 'ocr'）。
			
 
				+
			
 
				+    内部 helper 函数（get_avg_cleaned_chars_per_page / get_high_image_coverage_ratio
			
 
				+    / extract_pages / detect_invalid_chars）优先复用 MinerU 原版，供需要直接调用
			
 
				+    helper 的场景使用；_USING_MINERU_HELPERS 标识当前是否使用 MinerU helpers。
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from io import BytesIO
			
 
				+
			
 
				+import numpy as np
			
 
				+from loguru import logger
			
 
				+
			
 
				+
			
 
				+# ──────────────────────────────────────────────────────────────────────────────
			
 
				+# Helper 函数：优先复用 MinerU 原版（逻辑未修改，保持一致即可）
			
 
				+# ──────────────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+try:
			
 
				+    from mineru.utils.pdf_classify import (
			
 
				+        get_avg_cleaned_chars_per_page,
			
 
				+        get_high_image_coverage_ratio,
			
 
				+        extract_pages,
			
 
				+        detect_invalid_chars,
			
 
				+    )
			
 
				+    _USING_MINERU_HELPERS = True
			
 
				+
			
 
				+except ImportError:
			
 
				+    _USING_MINERU_HELPERS = False
			
 
				+
			
 
				+    def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check: int) -> float:
			
 
				+        """计算前 pages_to_check 页的平均清理后字符数。"""
			
 
				+        cleaned_total = 0
			
 
				+        for i in range(pages_to_check):
			
 
				+            page = pdf_doc[i]
			
 
				+            text = page.get_textpage().get_text_bounded()
			
 
				+            cleaned_total += len(re.sub(r'\s+', '', text))
			
 
				+        return cleaned_total / pages_to_check
			
 
				+
			
 
				+    def get_high_image_coverage_ratio(sample_pdf_bytes: bytes, pages_to_check: int) -> float:
			
 
				+        """
			
 
				+        计算高图像覆盖率（>= 80%）的页面占比。
			
 
				+        使用 pdfminer 遍历页面布局元素。
			
 
				+        """
			
 
				+        from pdfminer.pdfparser import PDFParser
			
 
				+        from pdfminer.pdfdocument import PDFDocument
			
 
				+        from pdfminer.pdfpage import PDFPage
			
 
				+        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			
 
				+        from pdfminer.layout import LAParams, LTImage, LTFigure
			
 
				+        from pdfminer.converter import PDFPageAggregator
			
 
				+
			
 
				+        pdf_stream = BytesIO(sample_pdf_bytes)
			
 
				+        parser = PDFParser(pdf_stream)
			
 
				+        document = PDFDocument(parser)
			
 
				+
			
 
				+        if not document.is_extractable:
			
 
				+            return 1.0
			
 
				+
			
 
				+        rsrcmgr = PDFResourceManager()
			
 
				+        laparams = LAParams(
			
 
				+            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
			
 
				+            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
			
 
				+        )
			
 
				+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
			
 
				+        interpreter = PDFPageInterpreter(rsrcmgr, device)
			
 
				+
			
 
				+        high_coverage_pages = 0
			
 
				+        page_count = 0
			
 
				+
			
 
				+        for page in PDFPage.create_pages(document):
			
 
				+            if page_count >= pages_to_check:
			
 
				+                break
			
 
				+            interpreter.process_page(page)
			
 
				+            layout = device.get_result()
			
 
				+
			
 
				+            page_area = layout.width * layout.height
			
 
				+            image_area = sum(
			
 
				+                el.width * el.height
			
 
				+                for el in layout
			
 
				+                if isinstance(el, (LTImage, LTFigure))
			
 
				+            )
			
 
				+            coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
			
 
				+            if coverage >= 0.8:
			
 
				+                high_coverage_pages += 1
			
 
				+            page_count += 1
			
 
				+
			
 
				+        pdf_stream.close()
			
 
				+        return 0.0 if page_count == 0 else high_coverage_pages / page_count
			
 
				+
			
 
				+    def extract_pages(src_pdf_bytes: bytes) -> bytes:
			
 
				+        """从 PDF 字节数据随机提取最多 10 页，返回新的 PDF 字节数据。"""
			
 
				+        import pypdfium2 as pdfium
			
 
				+
			
 
				+        pdf = pdfium.PdfDocument(src_pdf_bytes)
			
 
				+        total_page = len(pdf)
			
 
				+        if total_page == 0:
			
 
				+            logger.warning("PDF 为空，返回空文档")
			
 
				+            return b''
			
 
				+
			
 
				+        select_count = min(10, total_page)
			
 
				+        page_indices = np.random.choice(total_page, select_count, replace=False).tolist()
			
 
				+
			
 
				+        sample_doc = pdfium.PdfDocument.new()
			
 
				+        try:
			
 
				+            sample_doc.import_pages(pdf, page_indices)
			
 
				+            pdf.close()
			
 
				+            buf = BytesIO()
			
 
				+            sample_doc.save(buf)
			
 
				+            return buf.getvalue()
			
 
				+        except Exception as e:
			
 
				+            pdf.close()
			
 
				+            logger.exception(e)
			
 
				+            return b''
			
 
				+
			
 
				+    def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
			
 
				+        """检测 PDF 中是否包含乱码字符（(cid:xxx) 占比 > 5%）。"""
			
 
				+        from pdfminer.high_level import extract_text
			
 
				+        from pdfminer.layout import LAParams
			
 
				+
			
 
				+        laparams = LAParams(
			
 
				+            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
			
 
				+            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
			
 
				+        )
			
 
				+        text = extract_text(pdf_file=BytesIO(sample_pdf_bytes), laparams=laparams)
			
 
				+        text = text.replace('\n', '')
			
 
				+
			
 
				+        cid_pattern = re.compile(r'\(cid:\d+\)')
			
 
				+        matches = cid_pattern.findall(text)
			
 
				+        cid_count = len(matches)
			
 
				+        cid_len = sum(len(m) for m in matches)
			
 
				+        text_len = len(text)
			
 
				+
			
 
				+        if text_len == 0:
			
 
				+            return False
			
 
				+        cid_radio = cid_count / (cid_count + text_len - cid_len)
			
 
				+        return cid_radio > 0.05
			
 
				+
			
 
				+
			
 
				+# ──────────────────────────────────────────────────────────────────────────────
			
 
				+# classify：始终使用自有实现（包含对 MinerU 原版的定制修改）
			
 
				+# ──────────────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def classify(pdf_bytes: bytes) -> str:
			
 
				+    """
			
 
				+    判断 PDF 文件是可以直接提取文本还是需要 OCR。
			
 
				+
			
 
				+    与 MinerU 原版的差异（不修改上游代码）：
			
 
				+        检查图像覆盖率之前，若每页平均字符数已 >= chars_threshold * 4，
			
 
				+        则视为确定的文字型 PDF，跳过覆盖率检测。
			
 
				+        典型场景：含全页半透明水印图的银行流水文字 PDF，图像覆盖率接近 100%，
			
 
				+        但每页有大量可提取文字，应分类为 'txt' 而非 'ocr'。
			
 
				+
			
 
				+    Returns:
			
 
				+        'txt' — 可直接提取文本
			
 
				+        'ocr' — 需要 OCR
			
 
				+    """
			
 
				+    import pypdfium2 as pdfium
			
 
				+
			
 
				+    sample_pdf_bytes = extract_pages(pdf_bytes)
			
 
				+    pdf = pdfium.PdfDocument(sample_pdf_bytes)
			
 
				+    try:
			
 
				+        page_count = len(pdf)
			
 
				+        if page_count == 0:
			
 
				+            return 'ocr'
			
 
				+
			
 
				+        pages_to_check = min(page_count, 10)
			
 
				+        chars_threshold = 50
			
 
				+
			
 
				+        avg_chars = get_avg_cleaned_chars_per_page(pdf, pages_to_check)
			
 
				+        if avg_chars < chars_threshold or detect_invalid_chars(sample_pdf_bytes):
			
 
				+            return 'ocr'
			
 
				+
			
 
				+        # 仅在文字数量处于"临界量"时以图像覆盖率辅助判断。
			
 
				+        # 若文字数量已远超阈值（>= 4×），视为确定的文字型 PDF，
			
 
				+        # 不受背景图（如水印）干扰，直接返回 'txt'。
			
 
				+        if avg_chars < chars_threshold * 4 and get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
			
 
				+            return 'ocr'
			
 
				+
			
 
				+        return 'txt'
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        logger.error(f"判断 PDF 类型时出错: {e}")
			
 
				+        return 'ocr'
			
 
				+    finally:
			
 
				+        pdf.close()
			
 
				+
			
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -53,7 +53,7 @@ from .pdf_image_rendering import (
 
				 
			
 
				 # 导入 MinerU 组件
			
 
				 try:
			
 
				-    from mineru.utils.pdf_classify import classify as pdf_classify
			
 
				+    from .pdf_classify import classify as pdf_classify
			
 
				     from mineru.utils.enum_class import ImageType
			
 
				     MINERU_AVAILABLE = True
			
 
				 except ImportError:
			
@@ -116,7 +116,8 @@ class PDFUtils:
 
				         document_path: Path,
			
 
				         dpi: int = 200,
			
 
				         page_range: Optional[str] = None,
			
 
				-        renderer: str = "fitz"
			
 
				+        renderer: str = "fitz",
			
 
				+        pdf_bytes: Optional[bytes] = None,
			
 
				     ) -> Tuple[List[Dict], str, Optional[Any], str]:
			
 
				         """
			
 
				         加载文档并分类，支持页面范围过滤
			
@@ -128,6 +129,7 @@ class PDFUtils:
 
				                        - PDF：按页码（从1开始）
			
 
				                        - 图片目录：按文件名排序后的位置（从1开始）
			
 
				             renderer: PDF渲染引擎，"fitz" 或 "pypdfium2"
			
 
				+            pdf_bytes: 可选的 PDF 字节数据；若提供则跳过从文件读取（用于内存中预处理后的 PDF）
			
 
				             
			
 
				         Returns:
			
 
				             (images_list, pdf_type, pdf_doc, renderer_used)
			
@@ -177,8 +179,9 @@ class PDFUtils:
 
				             if not MINERU_AVAILABLE:
			
 
				                 raise RuntimeError("MinerU components not available for PDF processing")
			
 
				             
			
 
				-            with open(document_path, 'rb') as f:
			
 
				-                pdf_bytes = f.read()
			
 
				+            if pdf_bytes is None:
			
 
				+                with open(document_path, 'rb') as f:
			
 
				+                    pdf_bytes = f.read()
			
 
				             
			
 
				             # PDF分类
			
 
				             pdf_type = pdf_classify(pdf_bytes)
			
--- a/ocr_utils/watermark_utils.py
+++ b/ocr_utils/watermark_utils.py
@@ -0,0 +1,378 @@
 
				+"""
			
 
				+水印处理工具模块
			
 
				+
			
 
				+统一管理所有水印检测与去除能力，供整个平台复用：
			
 
				+
			
 
				+- 图像级（扫描 PDF / 图片）：
			
 
				+    detect_watermark()                检测图像中的斜向文字水印
			
 
				+    remove_watermark_from_image()     去除水印，返回灰度图
			
 
				+    remove_watermark_from_image_rgb() 去除水印，返回 RGB 图（适合模型输入）
			
 
				+
			
 
				+- PDF 层级（文字型 PDF，保留可搜索性）：
			
 
				+    scan_pdf_watermark_xobjs()        快速扫描 PDF 是否含水印 XObject（无副作用）
			
 
				+    remove_txt_pdf_watermark()        从内存 PDF bytes 去除水印，返回新 bytes 或 None
			
 
				+"""
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import re
			
 
				+from typing import Optional, Union
			
 
				+
			
 
				+import cv2
			
 
				+import numpy as np
			
 
				+from PIL import Image
			
 
				+
			
 
				+
			
 
				+# ─────────────────────────────────────────────────────────────────────────────
			
 
				+# 图像级水印检测与去除
			
 
				+# ─────────────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def detect_watermark(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    midtone_low: int = 100,
			
 
				+    midtone_high: int = 220,
			
 
				+    ratio_threshold: float = 0.03,
			
 
				+    check_diagonal: bool = True,
			
 
				+    diagonal_angle_range: tuple = (30, 60),
			
 
				+) -> bool:
			
 
				+    """
			
 
				+    检测图像中是否存在浅色斜向文字水印（银行流水类文档水印检测）。
			
 
				+
			
 
				+    原理：
			
 
				+    1. 将图像转为灰度，提取「中间调」像素（midtone_low ~ midtone_high），
			
 
				+       这些像素既不是纯白背景，也不是深黑正文，是浅灰水印的典型范围。
			
 
				+    2. 若中间调像素占比超过 ratio_threshold，初步判定存在水印。
			
 
				+    3. 若 check_diagonal=True，进一步用 Hough 直线变换验证中间调区域
			
 
				+       是否呈现斜向（diagonal_angle_range 度）纹理，以排除灰色背景误报。
			
 
				+
			
 
				+    Args:
			
 
				+        image: 输入图像，支持 PIL.Image 或 np.ndarray（BGR/RGB/灰度）。
			
 
				+        midtone_low: 中间调下限（默认 100），低于此视为深色正文。
			
 
				+        midtone_high: 中间调上限（默认 220），高于此视为纯白背景。
			
 
				+        ratio_threshold: 中间调像素占全图比例阈值（默认 0.03 即 3%）。
			
 
				+        check_diagonal: 是否进行斜向纹理验证（默认 True）。
			
 
				+        diagonal_angle_range: 斜向角度范围（度），默认 (30, 60)，含 45° 斜水印。
			
 
				+
			
 
				+    Returns:
			
 
				+        True 表示检测到水印，False 表示未检测到。
			
 
				+    """
			
 
				+    if isinstance(image, Image.Image):
			
 
				+        pil_img = image.convert('RGB') if image.mode == 'RGBA' else image
			
 
				+        np_img = np.array(pil_img)
			
 
				+        gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY) if np_img.ndim == 3 else np_img
			
 
				+    else:
			
 
				+        np_img = image
			
 
				+        gray = cv2.cvtColor(np_img, cv2.COLOR_BGR2GRAY) if np_img.ndim == 3 else np_img
			
 
				+
			
 
				+    midtone_mask = (gray > midtone_low) & (gray < midtone_high)
			
 
				+    ratio = midtone_mask.sum() / gray.size
			
 
				+
			
 
				+    if ratio < ratio_threshold:
			
 
				+        return False
			
 
				+
			
 
				+    if not check_diagonal:
			
 
				+        return True
			
 
				+
			
 
				+    midtone_uint8 = (midtone_mask.astype(np.uint8)) * 255
			
 
				+    edges = cv2.Canny(midtone_uint8, 50, 150, apertureSize=3)
			
 
				+    lines = cv2.HoughLines(edges, rho=1, theta=np.pi / 180, threshold=80)
			
 
				+
			
 
				+    if lines is None:
			
 
				+        return False
			
 
				+
			
 
				+    low_rad = np.deg2rad(diagonal_angle_range[0])
			
 
				+    high_rad = np.deg2rad(diagonal_angle_range[1])
			
 
				+    diagonal_count = 0
			
 
				+    for line in lines:
			
 
				+        theta = line[0][1]
			
 
				+        if low_rad <= theta <= high_rad or (np.pi - high_rad) <= theta <= (np.pi - low_rad):
			
 
				+            diagonal_count += 1
			
 
				+
			
 
				+    return diagonal_count >= 2
			
 
				+
			
 
				+
			
 
				+def remove_watermark_from_image(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    threshold: int = 160,
			
 
				+    morph_close_kernel: int = 2,
			
 
				+    return_pil: Optional[bool] = None,
			
 
				+) -> Union[np.ndarray, Image.Image]:
			
 
				+    """
			
 
				+    去除图像中的浅色斜向文字水印，返回灰度图。
			
 
				+
			
 
				+    原理：正文为深黑色（灰度 < threshold），水印为浅灰（灰度 > threshold）。
			
 
				+    将高于阈值的像素置为白色（255），保留低于阈值的深色正文。
			
 
				+
			
 
				+    Args:
			
 
				+        image: 输入图像（PIL.Image 或 np.ndarray BGR/RGB/灰度）。
			
 
				+        threshold: 灰度阈值（0-255）。建议范围 140-180，默认 160。
			
 
				+                   越大越保守（可能残留水印），越小越激进（可能损失浅色正文）。
			
 
				+        morph_close_kernel: 形态学闭运算核大小，用于填补字符断裂。0 跳过。
			
 
				+        return_pil: None（与输入同类型）| True（PIL.Image）| False（np.ndarray）。
			
 
				+
			
 
				+    Returns:
			
 
				+        去除水印后的灰度图：PIL.Image(mode='L') 或 np.ndarray(HxW, uint8)。
			
 
				+    """
			
 
				+    input_is_pil = isinstance(image, Image.Image)
			
 
				+
			
 
				+    if input_is_pil:
			
 
				+        pil_img = image.convert('RGB') if image.mode == 'RGBA' else image
			
 
				+        np_img = np.array(pil_img)
			
 
				+        if np_img.ndim == 3:
			
 
				+            np_img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
			
 
				+    else:
			
 
				+        np_img = image.copy()
			
 
				+
			
 
				+    gray = cv2.cvtColor(np_img, cv2.COLOR_BGR2GRAY) if np_img.ndim == 3 else np_img
			
 
				+
			
 
				+    cleaned = gray.copy()
			
 
				+    cleaned[gray > threshold] = 255
			
 
				+
			
 
				+    if morph_close_kernel > 0:
			
 
				+        kernel = np.ones((morph_close_kernel, morph_close_kernel), np.uint8)
			
 
				+        cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, kernel)
			
 
				+
			
 
				+    should_return_pil = input_is_pil if return_pil is None else return_pil
			
 
				+    return Image.fromarray(cleaned, mode='L') if should_return_pil else cleaned
			
 
				+
			
 
				+
			
 
				+def remove_watermark_from_image_rgb(
			
 
				+    image: Union[np.ndarray, Image.Image],
			
 
				+    threshold: int = 160,
			
 
				+    morph_close_kernel: int = 2,
			
 
				+    return_pil: Optional[bool] = None,
			
 
				+) -> Union[np.ndarray, Image.Image]:
			
 
				+    """
			
 
				+    去除水印并返回 RGB 三通道图像。
			
 
				+
			
 
				+    与 remove_watermark_from_image 逻辑相同，但输出为 RGB（三通道），
			
 
				+    方便直接传入布局检测、OCR 等需要彩色输入的下游模型。
			
 
				+
			
 
				+    Args/Returns: 同 remove_watermark_from_image，但输出为三通道：返回 PIL.Image 时为 RGB，返回 np.ndarray 时为 BGR 通道顺序。
			
 
				+    """
			
 
				+    input_is_pil = isinstance(image, Image.Image)
			
 
				+    gray_result = remove_watermark_from_image(image, threshold, morph_close_kernel, return_pil=False)
			
 
				+    rgb_np = cv2.cvtColor(gray_result, cv2.COLOR_GRAY2BGR)
			
 
				+
			
 
				+    should_return_pil = input_is_pil if return_pil is None else return_pil
			
 
				+    if should_return_pil:
			
 
				+        return Image.fromarray(cv2.cvtColor(rgb_np, cv2.COLOR_BGR2RGB))
			
 
				+    return rgb_np
			
 
				+
			
 
				+
			
 
				+# ─────────────────────────────────────────────────────────────────────────────
			
 
				+# PDF 层级水印去除（文字型 PDF，保留可搜索性）
			
 
				+# ─────────────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def _is_watermark_xobj(doc, xref: int, obj_str: str) -> bool:
			
 
				+    """
			
 
				+    判断一个 Form XObject 是否为水印。
			
 
				+
			
 
				+    启发式规则（满足其一即视为水印）：
			
 
				+    1. 含旋转/倾斜变换矩阵（cm 指令 a b c d e f 中 b 或 c 分量的绝对值 > 0.1），无论是否有 /Group
			
 
				+    2. 有透明度组（/Group）且内容流包含透明度操作符（ca/CA）
			
 
				+    3. 有透明度组且内容流体积 > 2KB（大量重复绘图 = 平铺水印）
			
 
				+    """
			
 
				+    if "/Form" not in obj_str:
			
 
				+        return False
			
 
				+
			
 
				+    try:
			
 
				+        stream = doc.xref_stream(xref)
			
 
				+        if not stream:
			
 
				+            return False
			
 
				+        stream_text = stream.decode("latin-1", errors="ignore")
			
 
				+    except Exception:
			
 
				+        return False
			
 
				+
			
 
				+    has_group = "/Group" in obj_str
			
 
				+
			
 
				+    cm_pattern = re.compile(
			
 
				+        r"([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+[-\d.]+\s+[-\d.]+\s+cm"
			
 
				+    )
			
 
				+    for m in cm_pattern.finditer(stream_text):
			
 
				+        a, b, c, d = float(m.group(1)), float(m.group(2)), float(m.group(3)), float(m.group(4))
			
 
				+        if abs(b) > 0.1 or abs(c) > 0.1:
			
 
				+            return True
			
 
				+
			
 
				+    if not has_group:
			
 
				+        return False
			
 
				+
			
 
				+    if re.search(r'\b(ca|CA)\s+[0-9.]+', stream_text) or re.search(r'[0-9.]+\s+(ca|CA)\b', stream_text):
			
 
				+        return True
			
 
				+
			
 
				+    if len(stream_text) > 2048:
			
 
				+        return True
			
 
				+
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				+def _is_watermark_image_xobj(doc, xref: int, obj_str: str) -> bool:
			
 
				+    """
			
 
				+    判断一个 Image XObject 是否为水印背景图。
			
 
				+
			
 
				+    判断规则（全部满足）：
			
 
				+    1. /Subtype /Image
			
 
				+    2. 有 /SMask（半透明）
			
 
				+    3. 宽 >= 600 且 高 >= 800（全页尺寸，排除小图标）
			
 
				+    4. 解码后像素均值 >= 240（近乎全白，水印文字稀疏）
			
 
				+    """
			
 
				+    if "/Image" not in obj_str or "/SMask" not in obj_str:
			
 
				+        return False
			
 
				+
			
 
				+    w_m = re.search(r'/Width\s+(\d+)', obj_str)
			
 
				+    h_m = re.search(r'/Height\s+(\d+)', obj_str)
			
 
				+    if not w_m or not h_m:
			
 
				+        return False
			
 
				+    if int(w_m.group(1)) < 600 or int(h_m.group(1)) < 800:
			
 
				+        return False
			
 
				+
			
 
				+    try:
			
 
				+        from io import BytesIO
			
 
				+        img_info = doc.extract_image(xref)
			
 
				+        pil_img = Image.open(BytesIO(img_info["image"])).convert("L")
			
 
				+        return float(np.array(pil_img).mean()) >= 240.0
			
 
				+    except Exception:
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def _blank_watermark_image(doc, img_xref: int) -> None:
			
 
				+    """
			
 
				+    将水印 Image XObject 的 RGB 流和 SMask 替换为全白/全不透明。
			
 
				+
			
 
				+    关键点：必须先移除 /DecodeParms（Predictor 11），再调用 update_stream。
			
 
				+    否则渲染器在 FlateDecode 之后还会尝试 Predictor 解码，失败后回退原始数据，
			
 
				+    水印依然可见。
			
 
				+    """
			
 
				+    obj_str = doc.xref_object(img_xref)
			
 
				+
			
 
				+    w_m = re.search(r'/Width\s+(\d+)', obj_str)
			
 
				+    h_m = re.search(r'/Height\s+(\d+)', obj_str)
			
 
				+    w = int(w_m.group(1)) if w_m else 1
			
 
				+    h = int(h_m.group(1)) if h_m else 1
			
 
				+    cs_m = re.search(r'/ColorSpace\s+/Device(RGB|Gray|CMYK)', obj_str)
			
 
				+    channels = {'RGB': 3, 'CMYK': 4}.get(cs_m.group(1) if cs_m else '', 1)
			
 
				+
			
 
				+    doc.xref_set_key(img_xref, "DecodeParms", "null")
			
 
				+    doc.update_stream(img_xref, bytes([255]) * (w * h * channels))
			
 
				+
			
 
				+    smask_m = re.search(r'/SMask\s+(\d+)\s+0\s+R', obj_str)
			
 
				+    if smask_m:
			
 
				+        smask_xref = int(smask_m.group(1))
			
 
				+        smask_obj = doc.xref_object(smask_xref)
			
 
				+        sw = int(m.group(1)) if (m := re.search(r'/Width\s+(\d+)', smask_obj)) else w
			
 
				+        sh = int(m.group(1)) if (m := re.search(r'/Height\s+(\d+)', smask_obj)) else h
			
 
				+        doc.xref_set_key(smask_xref, "DecodeParms", "null")
			
 
				+        doc.update_stream(smask_xref, bytes([255]) * (sw * sh))
			
 
				+
			
 
				+
			
 
				+def scan_pdf_watermark_xobjs(pdf_bytes: bytes, sample_pages: int = 3) -> bool:
			
 
				+    """
			
 
				+    快速扫描 PDF 前 N 页，判断是否含水印 XObject。
			
 
				+
			
 
				+    无副作用（只读），用于在执行去水印前快速判断，避免对无水印的大文件
			
 
				+    执行全量扫描和序列化，显著降低财报等大文件的处理开销。
			
 
				+
			
 
				+    Args:
			
 
				+        pdf_bytes: PDF 文件的原始字节。
			
 
				+        sample_pages: 扫描页数上限，默认 3（银行流水通常前几页有水印）。
			
 
				+
			
 
				+    Returns:
			
 
				+        True 表示发现水印 XObject，False 表示未发现。
			
 
				+    """
			
 
				+    try:
			
 
				+        import fitz
			
 
				+    except ImportError:
			
 
				+        return False
			
 
				+
			
 
				+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
			
 
				+    pages_to_check = min(sample_pages, len(doc))
			
 
				+    try:
			
 
				+        for i in range(pages_to_check):
			
 
				+            page = doc[i]
			
 
				+            for xref, *_ in page.get_xobjects():
			
 
				+                try:
			
 
				+                    obj_str = doc.xref_object(xref)
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+                if _is_watermark_xobj(doc, xref, obj_str):
			
 
				+                    return True
			
 
				+            for img_tuple in page.get_images(full=True):
			
 
				+                try:
			
 
				+                    obj_str = doc.xref_object(img_tuple[0])
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+                if _is_watermark_image_xobj(doc, img_tuple[0], obj_str):
			
 
				+                    return True
			
 
				+    finally:
			
 
				+        doc.close()
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				+def remove_txt_pdf_watermark(pdf_bytes: bytes) -> Optional[bytes]:
			
 
				+    """
			
 
				+    对文字型 PDF 执行原生水印去除，完全在内存中完成，不写临时文件。
			
 
				+
			
 
				+    支持两种水印形式：
			
 
				+    - Form XObject 水印：清空内容流
			
 
				+    - Image XObject 水印（全页背景图 + SMask 透明通道）：替换为全白像素
			
 
				+
			
 
				+    适用场景：pdf_type='txt' 的 PDF，去除后可直接传给渲染层（tobytes() → bytes）。
			
 
				+    对于大文件（如财报），建议先用 scan_pdf_watermark_xobjs() 快速判断再调用本函数。
			
 
				+
			
 
				+    Args:
			
 
				+        pdf_bytes: 原始 PDF 的字节内容。
			
 
				+
			
 
				+    Returns:
			
 
				+        去除水印后的 PDF bytes（garbage=4 压缩）；若未发现水印返回 None。
			
 
				+    """
			
 
				+    try:
			
 
				+        import fitz
			
 
				+    except ImportError:
			
 
				+        raise ImportError("请安装 PyMuPDF: pip install PyMuPDF")
			
 
				+
			
 
				+    from loguru import logger
			
 
				+
			
 
				+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
			
 
				+    processed_xrefs: set[int] = set()
			
 
				+    total_removed = 0
			
 
				+
			
 
				+    for page in doc:
			
 
				+        # ── Form XObject 水印 ─────────────────────────────────────────
			
 
				+        for xref, name, _invoker, _unused in page.get_xobjects():
			
 
				+            if xref in processed_xrefs:
			
 
				+                continue
			
 
				+            try:
			
 
				+                obj_str = doc.xref_object(xref)
			
 
				+            except Exception:
			
 
				+                continue
			
 
				+            if _is_watermark_xobj(doc, xref, obj_str):
			
 
				+                try:
			
 
				+                    doc.update_stream(xref, b"")
			
 
				+                    processed_xrefs.add(xref)
			
 
				+                    total_removed += 1
			
 
				+                    logger.debug(f"  [Form XObject] 清空水印 xref={xref}, name={name}")
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"  清空 Form XObject xref={xref} 失败: {e}")
			
 
				+
			
 
				+        # ── Image XObject 水印 ────────────────────────────────────────
			
 
				+        for img_tuple in page.get_images(full=True):
			
 
				+            img_xref = img_tuple[0]
			
 
				+            if img_xref in processed_xrefs:
			
 
				+                continue
			
 
				+            try:
			
 
				+                obj_str = doc.xref_object(img_xref)
			
 
				+            except Exception:
			
 
				+                continue
			
 
				+            if _is_watermark_image_xobj(doc, img_xref, obj_str):
			
 
				+                _blank_watermark_image(doc, img_xref)
			
 
				+                processed_xrefs.add(img_xref)
			
 
				+                total_removed += 1
			
 
				+                logger.debug(f"  [Image XObject] 替换水印图像 xref={img_xref}")
			
 
				+
			
 
				+    if total_removed == 0:
			
 
				+        doc.close()
			
 
				+        return None
			
 
				+
			
 
				+    result = doc.tobytes(garbage=4, deflate=True)
			
 
				+    doc.close()
			
 
				+    logger.info(f"✅ PDF 层级水印去除：共清除 {total_removed} 个水印 XObject")
			
 
				+    return result
			
--- a/ocr_validator/config/global.yaml
+++ b/ocr_validator/config/global.yaml
@@ -161,3 +161,4 @@ data_sources:
 
				   - 付_工商银行943825图.yaml
			
 
				   - 许_民生银行图.yaml
			
 
				   - 韩_中国银行图.yaml
			
 
				+  - 杨万益_福建农信.yaml
			
--- a/ocr_validator/config/杨万益_福建农信.yaml
+++ b/ocr_validator/config/杨万益_福建农信.yaml
@@ -0,0 +1,83 @@
 
				+# 文档: 杨万益_福建农信
			
 
				+document:
			
 
				+  name: "杨万益_福建农信"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v4
			
 
				+    # - tool: "mineru"
			
 
				+    #   result_dir: "bank_statement_yusys_v4"
			
 
				+    #   image_dir: "bank_statement_yusys_v4/{{name}}"
			
 
				+    #   description: "YUSYS-OCR框架 v4.0 GLM-OCR"
			
 
				+    #   enabled: true
			
 
				+
			
 
				+    # # bank_statement_yusys_v3
			
 
				+    # - tool: "mineru"
			
 
				+    #   result_dir: "bank_statement_yusys_v3"
			
 
				+    #   image_dir: "bank_statement_yusys_v3/{{name}}"
			
 
				+    #   description: "YUSYS-OCR框架 v3.0"
			
 
				+    #   enabled: true
			
 
				+
			
 
				+    # bank_statement_yusys_local glm-ocr
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_local"
			
 
				+      image_dir: "bank_statement_yusys_local/{{name}}"
			
 
				+      description: "YUSYS-OCR框架(local) GLM-OCR"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # bank_statement_paddle_vl_local paddleocr_vl
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_paddle_vl_local"
			
 
				+      image_dir: "bank_statement_paddle_vl_local/{{name}}"
			
 
				+      description: "YUSYS-OCR框架(local) PaddleOCR-VL"
			
 
				+      enabled: true
			
 
				+
			
 
				+    #  # MinerU
			
 
				+    # - tool: "mineru"
			
 
				+    #   result_dir: "mineru_vllm_results"
			
 
				+    #   image_dir: "mineru_vllm_results/{{name}}"
			
 
				+    #   description: "MinerU 图片合成结果"
			
 
				+    #   enabled: true
			
 
				+    
			
 
				+    # # MinerU (带 cell bbox)
			
 
				+    # - tool: "mineru"
			
 
				+    #   result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+    #   image_dir: "mineru_vllm_results/{{name}}"
			
 
				+    #   description: "MinerU + PaddleOCR 坐标"
			
 
				+    #   enabled: true
			
 
				+   
			
 
				+    # # PaddleOCR-VL
			
 
				+    # - tool: "paddleocr_vl"
			
 
				+    #   result_dir: "paddleocr_vl_results"
			
 
				+    #   image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+    #   description: "PaddleOCR VLM 图片合成结果"
			
 
				+    #   enabled: true
			
 
				+    
			
 
				+    # # PaddleOCR-VL (带 cell bbox)
			
 
				+    # - tool: "mineru"  # 格式同 MinerU
			
 
				+    #   result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+    #   image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+    #   description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+    #   enabled: true
			
 
				+    
			
 
				+    # # DotsOCR
			
 
				+    # - tool: "dots_ocr"
			
 
				+    #   result_dir: "dotsocr_vllm_results"
			
 
				+    #   image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+    #   description: "Dots OCR 图片合成结果"
			
 
				+    #   enabled: true
			
 
				+  
			
 
				+    # # DotsOCR (带 cell bbox)
			
 
				+    # - tool: "mineru"
			
 
				+    #   result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+    #   image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+    #   description: "Dots OCR + PaddleOCR 坐标"
			
 
				+    #   enabled: true
			
 
				+
			
 
				+    # # PPStructV3
			
 
				+    # - tool: "ppstructv3"
			
 
				+    #   result_dir: "ppstructurev3_client_results"
			
 
				+    #   image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+    #   description: "PPStructV3 图片合成结果"
			
 
				+    #   enabled: true
Author	SHA1 Message	Date
zhch158_admin	abedc4e5b6 feat(更新PDF分类工具注释): 添加PDF分类工具的注释，说明优先使用MinerU原版及延迟导入策略	3 months ago
zhch158_admin	fadba9a24b fix(修复MinerU组件导入路径): 修改MinerU组件的导入路径以确保正确加载	3 months ago
zhch158_admin	8032c96d96 feat(新增PDF文档类型分类工具): 添加PDF文档类型分类功能，支持判断可提取文本或需OCR	3 months ago
zhch158_admin	17d86604f4 feat(更新杨万益_福建农信文档配置): 修改输入输出路径和配置文件，添加日志文件路径	3 months ago
zhch158_admin	a2d8a22d91 feat(新增杨万益_福建农信文档配置): 添加杨万益_福建农信的OCR工具及结果目录配置	3 months ago
zhch158_admin	7fa6ad09bb feat(新增水印处理工具模块): 添加图像级和PDF层级水印检测与去除功能，支持多种输入格式	3 months ago
zhch158_admin	b6b75a00ba feat(新增PDF字节数据支持): 在PDFUtils中添加可选的pdf_bytes参数，支持内存中预处理后的PDF加载	3 months ago
zhch158_admin	e92c162db4 feat(新增水印检测和去除功能): 添加水印检测和去除的兼容别名，迁移至水印工具模块	3 months ago
zhch158_admin	4a59f8824f feat(新增水印去除功能): 在MinerU预处理器中添加水印去除功能，优化图像处理流程	3 months ago
zhch158_admin	9a0943bdd3 feat(新增文字型PDF水印预处理): 在文档加载过程中添加文字型PDF水印去除功能，提升文档可搜索性	3 months ago
zhch158_admin	6514d7f3fd feat(新增文字型PDF水印去除预处理): 在文档处理流程中添加文字型PDF水印去除功能，提升文档可搜索性	3 months ago
zhch158_admin	4f6f5e14b1 feat(新增水印去除功能): 在多个银行流水配置文件中添加文字型PDF水印去除功能，提升OCR处理效果	3 months ago
zhch158_admin	d154ca288c feat(新增银行流水水印去除工具): 添加支持 PDF 和常见图片格式的水印去除功能	3 months ago
zhch158_admin	95fe5c7b8c feat(更新PaddleOCR本地守护进程配置): 修改端口和模型路径以增强兼容性	3 months ago
zhch158_admin	d5b79e5f52 feat(更新GLM-OCR本地守护进程配置): 修改端口和模型路径，增强本地服务兼容性	3 months ago