12 Angajamente d3a9545849 ... 0e20f6612e

Autor SHA1 Mesaj Data
  zhch158_admin 0e20f6612e feat(config): 更新默认配置,切换至 GLM-VL 模型,调整页面范围和日志文件路径 1 săptămână în urmă
  zhch158_admin 85626ae88e feat(config): 添加 YUSYS-OCR v4.0 支持,更新多个文档的 OCR 工具配置 1 săptămână în urmă
  zhch158_admin 451b26652d feat(markdown_generator): 添加印章类型支持,增强 Markdown 输出功能 1 săptămână în urmă
  zhch158_admin 90fc1b8ed4 feat(tests): 添加 GLM-OCR 适配器和 API 测试用例,验证适配器加载和 API 调用 1 săptămână în urmă
  zhch158_admin 371113b468 feat(adapter): 添加 GLM-OCR 适配器支持,增强模型适配器功能 1 săptămână în urmă
  zhch158_admin b24aaa17be fix(adapter): 修改 seal 类别为独立类别,以支持 VLM 识别 1 săptămână în urmă
  zhch158_admin cb2803c537 feat(adapter): 添加 GLM-OCR VL 识别适配器,支持表格、公式、文本和印章识别 1 săptămână în urmă
  zhch158_admin e126aaed5a feat(element_processors): 添加印章元素处理功能,支持 VLM 识别 1 săptămână în urmă
  zhch158_admin 9292deaf3d feat(config): 添加银行交易流水V4场景配置,支持多种格式和GLM-OCR识别 1 săptămână în urmă
  zhch158_admin a92691ddbb feat(config): 添加银行交易流水场景配置,支持GLM-OCR进行VL识别 1 săptămână în urmă
  zhch158_admin 1594cd3e94 feat(processor_configs): 添加yusys_ocr_v4处理器配置,支持银行对账单处理 1 săptămână în urmă
  zhch158_admin 2c1f098ff0 feat(glmocr_vllm_daemon): 添加GLM-OCR vLLM服务守护进程脚本,支持模型启动、停止和状态检查 1 săptămână în urmă
31 fișiere modificate cu 1649 adăugiri și 13 ștergeri
  1. 359 0
      ocr_tools/daemons/glmocr_vllm_daemon.sh
  2. 16 1
      ocr_tools/ocr_batch/processor_configs.yaml
  3. 113 0
      ocr_tools/universal_doc_parser/config/bank_statement_glm_vl.yaml
  4. 178 0
      ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml
  5. 51 0
      ocr_tools/universal_doc_parser/core/element_processors.py
  6. 8 1
      ocr_tools/universal_doc_parser/core/model_factory.py
  7. 18 0
      ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
  8. 19 6
      ocr_tools/universal_doc_parser/main_v2.py
  9. 10 0
      ocr_tools/universal_doc_parser/models/adapters/__init__.py
  10. 413 0
      ocr_tools/universal_doc_parser/models/adapters/glmocr_vl_adapter.py
  11. 1 1
      ocr_tools/universal_doc_parser/models/adapters/pp_doclayout_v3_layout_adapter.py
  12. 188 0
      ocr_tools/universal_doc_parser/tests/test_glmocr_adapter.py
  13. 135 0
      ocr_tools/universal_doc_parser/tests/test_glmocr_api.py
  14. 20 0
      ocr_utils/json_formatters.py
  15. 13 0
      ocr_utils/markdown_generator.py
  16. 7 0
      ocr_validator/config/A用户_单元格扫描流水.yaml
  17. 7 0
      ocr_validator/config/B用户_扫描流水.yaml
  18. 0 2
      ocr_validator/config/global.yaml
  19. 7 0
      ocr_validator/config/乔_建设银行图.yaml
  20. 7 0
      ocr_validator/config/付_工商银行943825图.yaml
  21. 7 0
      ocr_validator/config/对公_招商银行图.yaml
  22. 7 0
      ocr_validator/config/山西云集科技有限公司.yaml
  23. 7 0
      ocr_validator/config/康强_北京农村商业银行.yaml
  24. 7 0
      ocr_validator/config/张_微信图.yaml
  25. 7 0
      ocr_validator/config/德_内蒙古银行照.yaml
  26. 7 0
      ocr_validator/config/提取自赤峰黄金2023年报.yaml
  27. 7 0
      ocr_validator/config/施博深.yaml
  28. 7 0
      ocr_validator/config/朱_中信银行图.yaml
  29. 7 0
      ocr_validator/config/湛_平安银行图.yaml
  30. 9 2
      ocr_validator/config/至远彩色_2023年报.yaml
  31. 7 0
      ocr_validator/config/许_民生银行图.yaml

+ 359 - 0
ocr_tools/daemons/glmocr_vllm_daemon.sh

@@ -0,0 +1,359 @@
+#!/bin/bash
+# filepath: ocr_platform/ocr_tools/daemons/glmocr_vllm_daemon.sh
+# Purpose: self-hosted vLLM service for GLM-OCR, SDK self-hosted mode
+# (config: maas.enabled=false, ocr_api points at this service).
+
+# Keep transformers compatible with vllm. Order matters: reinstall
+# transformers AFTER installing vllm (vllm may pin an older one).
+# uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly
+# uv pip install -U transformers
+# GLM-OCR vLLM service daemon script.
+
+# Runtime state (PID + log) lives under a fixed user directory.
+LOGDIR="/home/ubuntu/zhch/logs"
+mkdir -p $LOGDIR
+PIDFILE="$LOGDIR/glmocr_vllm.pid"
+LOGFILE="$LOGDIR/glmocr_vllm.log"
+
+# Service configuration.
+CONDA_ENV="mineru_2_7_1"
+PORT="20036"
+HOST="0.0.0.0"
+# Local model directory (must match the self-hosted ocr_api settings in
+# config-zhch.yaml, e.g. ocr_api.api_port).
+MODEL_PATH="/home/ubuntu/models/modelscope_cache/models/ZhipuAI/GLM-OCR"
+# Alternatively a HuggingFace model id: zai-org/GLM-OCR
+SERVED_MODEL_NAME="glm-ocr"
+ALLOWED_LOCAL_MEDIA_PATH="/"
+
+# GPU configuration.
+GPU_MEMORY_UTILIZATION="0.7"
+CUDA_VISIBLE_DEVICES="7"
+# Optional: enable MTP speculative decoding for faster inference.
+ENABLE_MTP="0"
+MTP_NUM_SPECULATIVE_TOKENS="1"
+
+# Environment variables (uncomment as needed).
+# export HF_HOME="/home/ubuntu/models/hf_home"
+# export HF_ENDPOINT="https://hf-mirror.com"
+# export MODELSCOPE_CACHE="/home/ubuntu/models/modelscope_cache"
+
+# Initialize and activate the conda environment properly; fall back to
+# prepending the env's bin directory when conda.sh is not found.
+if [ -f "/home/ubuntu/anaconda3/etc/profile.d/conda.sh" ]; then
+    source /home/ubuntu/anaconda3/etc/profile.d/conda.sh
+    conda activate $CONDA_ENV
+elif [ -f "/opt/conda/etc/profile.d/conda.sh" ]; then
+    source /opt/conda/etc/profile.d/conda.sh
+    conda activate $CONDA_ENV
+else
+    echo "Warning: Using direct conda path activation"
+    export PATH="/home/ubuntu/anaconda3/envs/$CONDA_ENV/bin:$PATH"
+fi
+
+start() {
+    # Launch the vLLM server for GLM-OCR in the background.
+    # Refuses to start when the PID recorded in $PIDFILE is still alive.
+    # Returns non-zero when a precondition fails or the process dies at startup.
+    if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+        echo "GLM-OCR vLLM is already running"
+        return 1
+    fi
+
+    echo "Starting GLM-OCR vLLM daemon..."
+    echo "Host: $HOST, Port: $PORT"
+    echo "Model path: $MODEL_PATH"
+    echo "Served model name: $SERVED_MODEL_NAME"
+    echo "GPU memory utilization: $GPU_MEMORY_UTILIZATION"
+    echo "CUDA devices: $CUDA_VISIBLE_DEVICES"
+
+    # A local (absolute) model path must exist as a directory; a HuggingFace
+    # id (contains '/' but is not absolute) is passed through unchecked.
+    if [[ "$MODEL_PATH" == /* ]]; then
+        if [ ! -d "$MODEL_PATH" ]; then
+            echo "❌ Model path not found: $MODEL_PATH"
+            echo "Use a local path or HuggingFace id (e.g. zai-org/GLM-OCR). Edit MODEL_PATH in this script."
+            return 1
+        fi
+    fi
+
+    # Sanity-check the activated environment before launching.
+    if ! command -v python >/dev/null 2>&1; then
+        echo "❌ Python not found. Check conda environment activation."
+        return 1
+    fi
+
+    if ! python -c "import vllm" 2>/dev/null; then
+        echo "❌ vllm not found. Install: uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly"
+        return 1
+    fi
+
+    echo "🔧 Using Python: $(which python)"
+    echo "🔧 vLLM: $(python -c 'import vllm; print(vllm.__file__)' 2>/dev/null || true)"
+
+    echo "📊 GPU 状态检查:"
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits | \
+        awk -F',' '{printf "  GPU %s: %s - 内存: %sMB/%sMB\n", $1, $2, $3, $4}'
+    else
+        echo "⚠️  nvidia-smi not available"
+    fi
+
+    # Build the vllm serve argument list once; enabling MTP speculative
+    # decoding only appends one flag (previously the whole nohup command
+    # was duplicated across the two branches — a drift hazard).
+    local serve_args=(
+        --host "$HOST"
+        --port "$PORT"
+        --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH"
+        --served-model-name "$SERVED_MODEL_NAME"
+        --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
+    )
+    if [ "$ENABLE_MTP" = "1" ]; then
+        serve_args+=(--speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $MTP_NUM_SPECULATIVE_TOKENS}")
+    fi
+
+    CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES TRANSFORMERS_USE_FAST=false \
+        nohup vllm serve "$MODEL_PATH" "${serve_args[@]}" > "$LOGFILE" 2>&1 &
+
+    echo $! > "$PIDFILE"
+    echo "✅ GLM-OCR vLLM started with PID: $(cat "$PIDFILE")"
+    echo "📋 Log file: $LOGFILE"
+    echo "🌐 Service URL: http://$HOST:$PORT"
+    echo "📖 OpenAI-compatible API: http://localhost:$PORT/v1 (chat/completions, models)"
+    echo ""
+    echo "Waiting for service to start..."
+    sleep 5
+    # Detect an immediate crash (bad model path, OOM, port clash, …)
+    # instead of reporting a stale PID as running.
+    if ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+        echo "❌ Process exited during startup. Check log: $LOGFILE"
+        rm -f "$PIDFILE"
+        return 1
+    fi
+    status
+}
+
+stop() {
+    # Stop the service recorded in $PIDFILE: SIGTERM first, wait up to
+    # 30 s for a graceful shutdown, then SIGKILL as a last resort.
+    if [ ! -f "$PIDFILE" ]; then
+        echo "GLM-OCR vLLM is not running"
+        return 1
+    fi
+
+    local pid
+    pid=$(cat "$PIDFILE")
+    echo "Stopping GLM-OCR vLLM (PID: $pid)..."
+
+    # 2>/dev/null: a stale PID file must not spew a kill error.
+    kill "$pid" 2>/dev/null
+
+    local i
+    for i in {1..30}; do
+        if ! kill -0 "$pid" 2>/dev/null; then
+            break
+        fi
+        echo "Waiting for process to stop... ($i/30)"
+        sleep 1
+    done
+
+    if kill -0 "$pid" 2>/dev/null; then
+        echo "Force killing process..."
+        kill -9 "$pid"
+    fi
+
+    rm -f "$PIDFILE"
+    echo "✅ GLM-OCR vLLM stopped"
+}
+
+status() {
+    # Report liveness, port/API reachability, GPU usage and recent log
+    # lines. Removes a stale PID file when the process is gone.
+    if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+        PID=$(cat "$PIDFILE")
+        echo "✅ GLM-OCR vLLM is running (PID: $PID)"
+        echo "🌐 Service URL: http://$HOST:$PORT"
+        echo "📋 Log file: $LOGFILE"
+
+        # Port check: prefer ss, fall back to netstat, skip if neither
+        # exists (the two former branches were copy-pasted duplicates).
+        local port_tool=""
+        if command -v ss >/dev/null 2>&1; then
+            port_tool="ss"
+        elif command -v netstat >/dev/null 2>&1; then
+            port_tool="netstat"
+        fi
+        if [ -n "$port_tool" ]; then
+            if "$port_tool" -tuln | grep -q ":$PORT "; then
+                echo "🔗 Port $PORT is being listened"
+            else
+                echo "⚠️  Port $PORT is not being listened (service may be starting up)"
+            fi
+        fi
+
+        # Cheap end-to-end probe against the OpenAI-compatible API.
+        if command -v curl >/dev/null 2>&1; then
+            if curl -s --connect-timeout 2 "http://127.0.0.1:$PORT/v1/models" > /dev/null 2>&1; then
+                echo "🎯 API 响应正常"
+            else
+                echo "⚠️  API 无响应 (service may be starting up)"
+            fi
+        fi
+
+        if command -v nvidia-smi >/dev/null 2>&1; then
+            echo "📊 GPU 使用情况:"
+            nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total --format=csv,noheader,nounits | \
+            awk -F',' '{printf "  GPU %s: GPU利用率 %s%%, 内存利用率 %s%%, 显存 %sMB/%sMB\n", $1, $2, $3, $4, $5}'
+        fi
+
+        if [ -f "$LOGFILE" ]; then
+            echo "📄 Latest logs (last 3 lines):"
+            tail -3 "$LOGFILE" | sed 's/^/  /'
+        fi
+    else
+        echo "❌ GLM-OCR vLLM is not running"
+        if [ -f "$PIDFILE" ]; then
+            echo "Removing stale PID file..."
+            rm -f "$PIDFILE"
+        fi
+    fi
+}
+
+logs() {
+    # Follow the service log; bail out early when it does not exist yet.
+    if [ ! -f "$LOGFILE" ]; then
+        echo "❌ Log file not found: $LOGFILE"
+        return
+    fi
+    echo "📄 GLM-OCR vLLM logs:"
+    echo "====================="
+    tail -f "$LOGFILE"
+}
+
+config() {
+    # Dump the script's static configuration plus a snapshot of the
+    # runtime environment (python/vllm/conda/cuda locations, GPUs).
+    echo "📋 Current configuration:"
+    echo "  Conda Environment: $CONDA_ENV"
+    echo "  Host: $HOST"
+    echo "  Port: $PORT"
+    echo "  Model Path: $MODEL_PATH"
+    echo "  Served Model Name: $SERVED_MODEL_NAME"
+    echo "  Allowed Local Media Path: $ALLOWED_LOCAL_MEDIA_PATH"
+    echo "  GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
+    echo "  CUDA Visible Devices: $CUDA_VISIBLE_DEVICES"
+    echo "  Enable MTP: $ENABLE_MTP"
+    echo "  PID File: $PIDFILE"
+    echo "  Log File: $LOGFILE"
+
+    # Display-only directory listing; `ls | head` is fine here since the
+    # output is never parsed.
+    if [ -d "$MODEL_PATH" ]; then
+        echo "✅ Model path exists"
+        echo "  Model files:"
+        ls -la "$MODEL_PATH" | head -10 | sed 's/^/    /'
+        if [ $(ls -1 "$MODEL_PATH" 2>/dev/null | wc -l) -gt 10 ]; then
+            echo "    ... and more files"
+        fi
+    else
+        echo "❌ Model path not found (use HuggingFace id like zai-org/GLM-OCR by setting MODEL_PATH)"
+    fi
+
+    echo ""
+    echo "🔧 Environment:"
+    echo "  Python: $(which python 2>/dev/null || echo 'Not found')"
+    echo "  vLLM: $(python -c 'import vllm; print(vllm.__file__)' 2>/dev/null || echo 'Not found')"
+    echo "  Conda: $(which conda 2>/dev/null || echo 'Not found')"
+    echo "  CUDA: $(which nvcc 2>/dev/null || echo 'Not found')"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        echo ""
+        echo "🔥 GPU Information:"
+        nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader,nounits | \
+        awk -F',' '{printf "  GPU %s: %s (Driver: %s, Memory: %sMB)\n", $1, $2, $3, $4}'
+    fi
+}
+
+test_api() {
+    # Smoke-test the running service by querying /v1/models.
+    # Requires the daemon to be up (live PID) and curl to be installed.
+    echo "🧪 Testing GLM-OCR vLLM API..."
+
+    if [ ! -f "$PIDFILE" ] || ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+        echo "❌ GLM-OCR vLLM service is not running"
+        return 1
+    fi
+
+    if ! command -v curl >/dev/null 2>&1; then
+        echo "❌ curl command not found"
+        return 1
+    fi
+
+    echo "📡 Testing /v1/models endpoint..."
+    # Test curl's exit status directly in the if; the former
+    # `response=$(...)` followed by `[ $? -eq 0 ]` pattern is fragile.
+    local response
+    if response=$(curl -s --connect-timeout 10 "http://127.0.0.1:$PORT/v1/models"); then
+        echo "✅ Models endpoint accessible"
+        # Pretty-print when python is available; fall back to raw JSON.
+        echo "$response" | python -m json.tool 2>/dev/null || echo "$response"
+    else
+        echo "❌ Models endpoint not accessible"
+    fi
+}
+
+test_client() {
+    # Print instructions for exercising the GLM-OCR SDK / Flask server
+    # against this vLLM backend. Only prints guidance — it does not run
+    # the client itself. Requires the service to be up and a sample image.
+    echo "🧪 Testing GLM-OCR SDK with vLLM server (self-hosted mode)..."
+
+    if [ ! -f $PIDFILE ] || ! kill -0 $(cat $PIDFILE) 2>/dev/null; then
+        echo "❌ GLM-OCR vLLM service is not running. Start it first: $0 start"
+        return 1
+    fi
+
+    # NOTE(review): hard-coded sample path — adjust per deployment.
+    TEST_IMAGE="/home/ubuntu/zhch/data/test/sample.png"
+    if [ ! -f "$TEST_IMAGE" ]; then
+        echo "⚠️  Test image not found: $TEST_IMAGE"
+        echo "Update TEST_IMAGE in this script or create the file."
+        return 1
+    fi
+
+    echo "📄 Test image: $TEST_IMAGE"
+    echo "Run GLM-OCR with config that has maas.enabled=false and ocr_api pointing to 127.0.0.1:$PORT"
+    echo "Example: glmocr parse $TEST_IMAGE --config /path/to/config.yaml"
+    echo ""
+    echo "Or start GLM-OCR Flask server (layout+OCR) that uses this vLLM backend:"
+    echo "  glmocr server --config /path/to/config-zhch.yaml  # with maas.enabled=false, ocr_api.api_port=$PORT"
+    echo "Then: curl -X POST http://localhost:5002/glmocr/parse -H 'Content-Type: application/json' -d '{\"images\": [\"file://$TEST_IMAGE\"]}'"
+}
+
+usage() {
+    # Print the command summary and current static configuration.
+    # A single here-doc replaces the long echo chain; $-expansions
+    # behave exactly as they did inside the double-quoted echos.
+    cat <<EOF
+GLM-OCR vLLM Service Daemon
+===========================
+Usage: $0 {start|stop|restart|status|logs|config|test|test-client}
+
+Commands:
+  start       - Start the GLM-OCR vLLM service
+  stop        - Stop the GLM-OCR vLLM service
+  restart     - Restart the GLM-OCR vLLM service
+  status      - Show service status and resource usage
+  logs        - Show service logs (follow mode)
+  config      - Show current configuration
+  test        - Test /v1/models API endpoint
+  test-client - Show how to test SDK/Flask with this vLLM backend
+
+Configuration (edit script to modify):
+  Host: $HOST
+  Port: $PORT
+  Model Path: $MODEL_PATH
+  Served Model Name: $SERVED_MODEL_NAME
+  GPU Memory: $GPU_MEMORY_UTILIZATION
+  CUDA Devices: $CUDA_VISIBLE_DEVICES
+  Enable MTP: $ENABLE_MTP
+
+Examples:
+  ./glmocr_vllm_daemon.sh start
+  ./glmocr_vllm_daemon.sh status
+  ./glmocr_vllm_daemon.sh logs
+  ./glmocr_vllm_daemon.sh test
+EOF
+}
+
+# Command dispatch: map the first CLI argument to a handler function.
+# Unknown or missing commands print usage and exit non-zero.
+case "$1" in
+    start)
+        start
+        ;;
+    stop)
+        stop
+        ;;
+    restart)
+        # Stop, give the port/GPU a moment to free up, then start again.
+        stop
+        sleep 3
+        start
+        ;;
+    status)
+        status
+        ;;
+    logs)
+        logs
+        ;;
+    config)
+        config
+        ;;
+    test)
+        # 'test' would shadow the shell builtin, hence the test_api name.
+        test_api
+        ;;
+    test-client)
+        test_client
+        ;;
+    *)
+        usage
+        exit 1
+        ;;
+esac

+ 16 - 1
ocr_tools/ocr_batch/processor_configs.yaml

@@ -8,7 +8,22 @@ processors:
   # MinerU vLLM 处理器
   # 基于 MinerU 的多线程批量处理(支持 PDF 和图片)
   # -------------------------------------------------------------------------
-  yusys_ocr:
+  yusys_ocr_v4:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml"
+      - "--pages=1-35"
+      - "--streaming"
+      - "--debug"
+      - "--log_level=DEBUG"
+    output_subdir: "bank_statement_yusys_v4"
+    log_subdir: "logs/bank_statement_yusys_v4"
+    venv: "conda activate mineru2"
+    description: "YUSYS Wired UNET OCR 框架 GLM-OCR"
+
+  yusys_ocr_v3:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"

+ 113 - 0
ocr_tools/universal_doc_parser/config/bank_statement_glm_vl.yaml

@@ -0,0 +1,113 @@
+# 银行交易流水场景配置 - GLM-OCR 版本
+scene_name: "bank_statement_glm"
+description: "银行交易流水、对账单等场景(使用 GLM-OCR 进行 VL 识别)"
+
+input:
+  supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
+  dpi: 200  # PDF转图片的DPI
+
+preprocessor:
+  module: "mineru"
+  orientation_classifier:
+    enabled: true
+    model_name: "paddle_orientation_classification"
+    model_dir: null  # 使用默认路径
+  unwarping:
+    enabled: false
+
+# ============================================================
+# Layout 检测配置 - 使用 PP-DocLayoutV3
+# ============================================================
+layout_detection:
+  module: "paddle"
+  model_name: "PP-DocLayoutV3"
+  model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
+  device: "cpu"
+  conf: 0.3
+  num_threads: 4
+  batch_size: 1
+  
+  # 后处理配置
+  post_process:
+    # 将大面积文本块转换为表格(后处理)
+    convert_large_text_to_table: true  # 是否启用
+    min_text_area_ratio: 0.25         # 最小面积占比(25%)
+    min_text_width_ratio: 0.4         # 最小宽度占比(40%)
+    min_text_height_ratio: 0.3        # 最小高度占比(30%)
+
+  # Debug 可视化配置
+  debug_options:
+    enabled: true               # 是否开启调试可视化输出
+    output_dir: null             # 调试输出目录;null不输出
+    prefix: ""                  # 保存文件名前缀(如设置为页码)
+
+# ============================================================
+# VL识别配置 - 使用 GLM-OCR
+# ============================================================
+vl_recognition:
+  module: "glmocr"
+  api_url: "http://10.192.72.11:20036/v1/chat/completions"
+  api_key: null  # 可选,如需要可填写
+  model: "glm-ocr"
+  max_image_size: 3500  # GLM-OCR 推荐的最大图片尺寸
+  resize_mode: 'max'    # 缩放模式: 'max' 保持宽高比, 'fixed' 固定尺寸
+  verify_ssl: false
+  
+  # Task prompt mapping - 针对不同任务使用不同提示词
+  task_prompt_mapping:
+    text: "Text Recognition:"
+    table: "Table Recognition:"
+    formula: "Formula Recognition:"
+    seal: "Seal Recognition:"  # 印章识别的专用提示词
+  
+  # 模型参数
+  model_params:
+    connection_pool_size: 128  # HTTP 连接池大小(应 >= max_workers)
+    http_timeout: 300          # HTTP 请求超时时间(秒)
+    connect_timeout: 30        # 连接超时时间(秒)
+    retry_max_attempts: 2      # 最大重试次数
+    retry_backoff_base_seconds: 0.5
+    retry_backoff_max_seconds: 8.0
+    retry_jitter_ratio: 0.2
+    retry_status_codes: [429, 500, 502, 503, 504]
+    max_tokens: 4096
+    temperature: 0.8
+    top_p: 0.9
+    top_k: 50
+    repetition_penalty: 1.1
+  
+  # 场景特定配置
+  table_recognition:
+    return_cells_coordinate: false  # GLM-OCR 不直接返回单元格坐标
+    bank_statement_mode: true
+
+# ============================================================
+# OCR识别配置
+# ============================================================
+ocr_recognition:
+  module: "mineru" 
+  language: "ch"
+  det_threshold: 0.6
+  unclip_ratio: 1.5
+  enable_merge_det_boxes: false
+  batch_size: 8
+  device: "cpu"
+
+# ============================================================
+# 输出配置
+# ============================================================
+output:
+  create_subdir: false
+  save_pdf_images: true
+  save_json: true
+  save_page_json: true
+  save_markdown: true
+  save_page_markdown: true
+  save_html: true
+  save_layout_image: true
+  save_ocr_image: true
+  draw_type_label: true
+  draw_bbox_number: true
+  save_enhanced_json: true
+  normalize_numbers: true
+  debug_mode: true

+ 178 - 0
ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml

@@ -0,0 +1,178 @@
+# 银行交易流水场景配置 - V4版本
+# Pipeline V3逻辑: 有线表格使用MinerU UNet, 无线表格/seal使用GLM-OCR VLM
+scene_name: "bank_statement_yusys_v4"
+
+description: "银行流水V4: PP-DocLayoutV3 layout + PaddleOCR + MinerU UNet(有线表格)+ GLM-OCR VLM(无线表格/seal)"
+
+input:
+  supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
+  dpi: 200
+
+preprocessor:
+  module: "mineru"
+  orientation_classifier:
+    enabled: true
+    model_name: "paddle_orientation_classification"
+    model_dir: null  # 使用默认路径
+  unwarping:
+    enabled: false
+
+# ============================================================
+# Layout 检测配置 - 使用 PP-DocLayoutV3
+# ============================================================
+layout_detection:
+  module: "paddle"
+  model_name: "PP-DocLayoutV3"
+  model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
+  device: "cpu"
+  conf: 0.3
+  num_threads: 4
+  batch_size: 1
+  
+  # 后处理配置
+  post_process:
+    # 将大面积文本块转换为表格(后处理)
+    convert_large_text_to_table: true  # 是否启用
+    min_text_area_ratio: 0.25         # 最小面积占比(25%)
+    min_text_width_ratio: 0.4         # 最小宽度占比(40%)
+    min_text_height_ratio: 0.3        # 最小高度占比(30%)
+
+  # Debug 可视化配置
+  debug_options:
+    enabled: true               # 是否开启调试可视化输出
+    output_dir: null             # 调试输出目录;null不输出
+    prefix: ""                  # 保存文件名前缀(如设置为页码)
+
+# ============================================================
+# OCR 识别配置
+# ============================================================
+ocr_recognition:
+  module: "mineru"
+  language: "ch"
+  det_threshold: 0.5
+  unclip_ratio: 1.5
+  enable_merge_det_boxes: false
+  batch_size: 8
+  device: "cpu"
+
+# ============================================================
+# 表格分类配置(自动区分有线/无线表格)
+# ============================================================
+table_classification:
+  enabled: true               # 启用自动表格分类
+  module: "paddle"            # 分类模型:paddle(MinerU PaddleTableClsModel)
+  confidence_threshold: 0.5   # 分类置信度阈值
+  batch_size: 16              # 批处理大小
+
+  # Debug 可视化配置
+  debug_options:
+    enabled: true               # 是否开启调试可视化输出
+    output_dir: null             # 调试输出目录;null不输出
+    save_table_lines: true       # 保存表格线可视化(unet横线/竖线叠加)
+    image_format: "png"          # 可视化图片格式:png/jpg
+    prefix: ""                  # 保存文件名前缀(如设置为页码/表格序号)
+
+# ============================================================
+# 有线表格识别专用配置(MinerU UNet)
+# ============================================================
+table_recognition_wired:
+  use_wired_unet: true
+  upscale_ratio: 3.333
+  need_ocr: true
+  row_threshold: 10
+  col_threshold: 15
+  ocr_conf_threshold: 0.9       # 单元格 OCR 置信度阈值
+  cell_crop_margin: 2
+  use_custom_postprocess: true  # 是否使用自定义后处理(默认启用)
+
+  # 是否启用倾斜矫正
+  enable_deskew: true
+
+  # 🆕 启用多源单元格融合
+  use_cell_fusion: true
+  
+  # 融合引擎配置
+  cell_fusion:
+    # RT-DETR 模型路径(必需)
+    rtdetr_model_path: "/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx"
+    
+    # 融合权重
+    unet_weight: 0.6        # UNet 权重(结构性强)
+    rtdetr_weight: 0.4      # RT-DETR 权重(鲁棒性强)
+    
+    # 阈值配置
+    iou_merge_threshold: 0.7    # 高IoU合并阈值(>0.7则加权平均)
+    iou_nms_threshold: 0.5      # NMS去重阈值
+    rtdetr_conf_threshold: 0.5  # RT-DETR置信度阈值
+    
+    # 功能开关
+    enable_ocr_compensation: true      # 启用OCR边缘补偿
+
+  # Debug 可视化配置
+  debug_options:
+    enabled: true               # 是否开启调试可视化输出
+    output_dir: null             # 调试输出目录;null不输出
+    save_table_lines: true       # 保存表格线可视化(unet横线/竖线叠加)
+    save_connected_components: true  # 保存连通域提取的单元格图
+    save_grid_structure: true    # 保存逻辑网格结构(row/col/rowspan/colspan)
+    save_text_overlay: true      # 保存文本填充覆盖图
+    image_format: "png"          # 可视化图片格式:png/jpg
+    prefix: ""                  # 保存文件名前缀(如设置为页码/表格序号)
+
+# ============================================================
+# VL识别配置 - 使用 GLM-OCR(无线表格 + seal识别)
+# ============================================================
+vl_recognition:
+  module: "glmocr"
+  api_url: "http://10.192.72.11:20036/v1/chat/completions"
+  api_key: null  # 可选,如需要可填写
+  model: "glm-ocr"
+  max_image_size: 3500  # GLM-OCR 推荐的最大图片尺寸
+  resize_mode: 'max'    # 缩放模式: 'max' 保持宽高比, 'fixed' 固定尺寸
+  verify_ssl: false
+  
+  # Task prompt mapping - 针对不同任务使用不同提示词
+  task_prompt_mapping:
+    text: "Text Recognition:"
+    table: "Table Recognition:"
+    formula: "Formula Recognition:"
+    seal: "Seal Recognition:"  # 印章识别的专用提示词
+  
+  # 模型参数
+  model_params:
+    connection_pool_size: 128  # HTTP 连接池大小(应 >= max_workers)
+    http_timeout: 300          # HTTP 请求超时时间(秒)
+    connect_timeout: 30        # 连接超时时间(秒)
+    retry_max_attempts: 2      # 最大重试次数
+    retry_backoff_base_seconds: 0.5
+    retry_backoff_max_seconds: 8.0
+    retry_jitter_ratio: 0.2
+    retry_status_codes: [429, 500, 502, 503, 504]
+    max_tokens: 4096
+    temperature: 0.8
+    top_p: 0.9
+    top_k: 50
+    repetition_penalty: 1.1
+  
+  # 场景特定配置
+  table_recognition:
+    return_cells_coordinate: false  # GLM-OCR 不直接返回单元格坐标
+
+# ============================================================
+# 输出配置
+# ============================================================
+output:
+  create_subdir: false
+  save_pdf_images: true
+  save_json: true
+  save_page_json: true
+  save_markdown: true
+  save_page_markdown: true
+  save_html: true
+  save_layout_image: true
+  save_ocr_image: true
+  draw_type_label: true
+  draw_bbox_number: true
+  save_enhanced_json: true
+  normalize_numbers: true
+  debug_mode: true

+ 51 - 0
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -655,6 +655,57 @@ class ElementProcessors:
             'content': content
         }
     
+    def process_seal_element(
+        self,
+        image: np.ndarray,
+        layout_item: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Process a seal (stamp) element using VLM recognition.
+        
+        Args:
+            image: Full page image the seal is cropped from.
+            layout_item: Layout detection item; 'bbox' and 'category'
+                are read, with defaults when absent.
+            
+        Returns:
+            Element dict with 'type', 'bbox', 'confidence' and 'content'
+            ({'text', 'confidence'}); text is empty when recognition fails.
+        """
+        bbox = layout_item.get('bbox', [0, 0, 0, 0])
+        category = layout_item.get('category', 'seal')
+        cropped_region = CoordinateUtils.crop_region(image, bbox)
+        
+        # Fallback content used when the recognizer is missing or raises.
+        content = {'text': '', 'confidence': 0.0}
+        
+        try:
+            # Lazily load the VL recognizer.
+            vl_recognizer = self._ensure_vl_recognizer()
+            if vl_recognizer is None:
+                logger.error("❌ VL recognizer not available for seal recognition")
+                # NOTE(review): this early return omits the 'confidence'
+                # key that the normal return below includes — confirm
+                # downstream consumers tolerate the missing key.
+                return {
+                    'type': category,
+                    'bbox': bbox,
+                    'content': content
+                }
+            
+            # Call recognize_text with element_type='seal'; the GLM-OCR
+            # adapter selects the seal-specific prompt from this type.
+            seal_result = vl_recognizer.recognize_text(cropped_region, element_type='seal')
+            content = {
+                'text': seal_result.get('text', ''),
+                'confidence': seal_result.get('confidence', 0.0)
+            }
+            
+            # Log the recognized text, truncated to 50 chars with ellipsis.
+            logger.info(f"🔖 Seal recognized: {content['text'][:50]}..." if len(content['text']) > 50 else f"🔖 Seal recognized: {content['text']}")
+        except Exception as e:
+            # Best-effort: recognition failure degrades to empty content.
+            logger.warning(f"Seal recognition failed: {e}")
+        
+        return {
+            'type': category,
+            'bbox': bbox,
+            'confidence': layout_item.get('confidence', 0.0),
+            'content': content
+        }
+    
     def process_image_element(
         self,
         image: np.ndarray,

+ 8 - 1
ocr_tools/universal_doc_parser/core/model_factory.py

@@ -39,9 +39,13 @@ class ModelFactory:
     def create_layout_detector(cls, config: Dict[str, Any]) -> BaseLayoutDetector:
         # 根据配置创建检测器
         module_name = config.get('module', 'mineru')
-        if module_name == 'paddle':
+        model_name = config.get('model_name', 'default')
+        if module_name == 'paddle' and model_name == 'RT-DETR-H_layout_17cls':
             from models.adapters import PaddleLayoutDetector
             detector = PaddleLayoutDetector(config)
+        elif module_name == 'paddle' and model_name == 'PP-DocLayoutV3':
+            from models.adapters import PPDocLayoutV3Detector
+            detector = PPDocLayoutV3Detector(config)
         elif module_name == 'docling':
             from models.adapters import DoclingLayoutDetector
             detector = DoclingLayoutDetector(config)
@@ -74,6 +78,9 @@ class ModelFactory:
         elif module_name == 'mineru':
             from models.adapters import MinerUVLRecognizer
             recognizer = MinerUVLRecognizer(config)
+        elif module_name == 'glmocr':
+            from models.adapters import GLMOCRVLRecognizer
+            recognizer = GLMOCRVLRecognizer(config)
         else:
             raise ValueError(f"Unknown VL recognizer module: {module_name}")
             

+ 18 - 0
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -88,6 +88,9 @@ class EnhancedDocPipeline:
         'interline_equation_yolo', 'interline_equation_number'
     ]
     
+    # Seal(印章)类元素 - 需要 VLM 识别
+    SEAL_CATEGORIES = ['seal']
+    
     # 丢弃类元素(水印、装饰等)
     DISCARD_CATEGORIES = ['abandon', 'discarded']
     
@@ -750,6 +753,7 @@ class EnhancedDocPipeline:
             'image_body': [],
             'image_text': [],
             'equation': [],
+            'seal': [],  # 🔧 添加 seal 类别
             'code': [],
             'discard': []
         }
@@ -769,6 +773,8 @@ class EnhancedDocPipeline:
                 classified['image_text'].append(item)
             elif category in self.EQUATION_CATEGORIES:
                 classified['equation'].append(item)
+            elif category in self.SEAL_CATEGORIES:
+                classified['seal'].append(item)
             elif category in self.CODE_CATEGORIES:
                 classified['code'].append(item)
             elif category in self.DISCARD_CATEGORIES:
@@ -784,6 +790,7 @@ class EnhancedDocPipeline:
                    f"image={len(classified['image_body'])}, "
                    f"image_text={len(classified['image_text'])}, "
                    f"equation={len(classified['equation'])}, "
+                   f"seal={len(classified['seal'])}, "
                    f"code={len(classified['code'])}, "
                    f"discard={len(classified['discard'])}")
         
@@ -952,6 +959,17 @@ class EnhancedDocPipeline:
                 logger.warning(f"⚠️ Equation processing failed: {e}")
                 processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
         
+        # 🔧 处理 Seal(印章)元素 - 使用 VLM 识别
+        for item in classified_elements['seal']:
+            try:
+                element = self.element_processors.process_seal_element(
+                    detection_image, item
+                )
+                processed_elements.append(element)
+            except Exception as e:
+                logger.warning(f"⚠️ Seal processing failed: {e}")
+                processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
+        
         # 处理图片主体
         for item in classified_elements['image_body']:
             try:

+ 19 - 6
ocr_tools/universal_doc_parser/main_v2.py

@@ -414,6 +414,15 @@ if __name__ == "__main__":
         # 默认配置(用于开发测试)
         default_config = {
             # 测试输入
+            # "input": "/Users/zhch158/workspace/data/流水分析/湛_平安银行图.pdf",
+            # "output_dir": "./output/湛_平安银行图/bank_statement_yusys_v3",
+
+            # "input": "/Users/zhch158/workspace/data/流水分析/张_微信图.pdf",
+            # "output_dir": "./output/张_微信图/bank_statement_yusys_v3",
+
+            # "input": "/Users/zhch158/workspace/data/流水分析/许_民生银行图.pdf",
+            # "output_dir": "./output/许_民生银行图/bank_statement_yusys_v3",
+
             # "input": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行.pdf",
             # "output_dir": "./output/康强_北京农村商业银行/bank_statement_mineru_vl",
 
@@ -425,9 +434,12 @@ if __name__ == "__main__":
 
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_005.png",
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
+            # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
             # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003_270_skew(-0.4).png",
-            # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
+            "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
+            # "output_dir": "./output/2023年度报告母公司/bank_statement_yusys_v3",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v3",
+            "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_glm_vl",
 
             # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v2",
@@ -444,8 +456,8 @@ if __name__ == "__main__":
             # "input": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照.pdf",
             # "output_dir": "./output/德_内蒙古银行照/bank_statement_yusys_v3",
 
-            "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/提取自赤峰黄金2023年报.pdf",
-            "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
+            # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/提取自赤峰黄金2023年报.pdf",
+            # "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
             # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
 
@@ -463,7 +475,8 @@ if __name__ == "__main__":
             # "output_dir": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司/bank_statement_yusys_v3",
 
             # 配置文件
-            "config": "./config/bank_statement_yusys_v3.yaml",
+            "config": "./config/bank_statement_glm_vl.yaml",
+            # "config": "./config/bank_statement_yusys_v3.yaml",
             # "config": "./config/bank_statement_smart_router.yaml",
             # "config": "./config/bank_statement_mineru_vl.yaml",
             # "config": "./config/bank_statement_yusys_v2.yaml",
@@ -473,7 +486,7 @@ if __name__ == "__main__":
             "scene": "bank_statement",
             
             # 页面范围(可选)
-            "pages": "7",  # 只处理前1页
+            "pages": "3-7",  # 处理第 3-7 页
             # "pages": "1-3,5,7-10",  # 处理指定页面
             # "pages": "83-109",  # 处理指定页面
 
@@ -486,7 +499,7 @@ if __name__ == "__main__":
             "log_level": "DEBUG",
 
             # 日志文件
-            "log_file": "./output/logs/bank_statement_yusys_v3/process.log",
+            "log_file": "./output/logs/bank_statement_glm_vl/process.log",
         }
         
         # 构造参数

+ 10 - 0
ocr_tools/universal_doc_parser/models/adapters/__init__.py

@@ -17,6 +17,13 @@ from .paddle_vl_adapter import PaddleVLRecognizer
 from .docling_layout_adapter import DoclingLayoutDetector
 from .pp_doclayout_v3_layout_adapter import PPDocLayoutV3Detector
 
+# GLM-OCR 适配器
+try:
+    from .glmocr_vl_adapter import GLMOCRVLRecognizer
+    GLMOCR_AVAILABLE = True
+except ImportError:
+    GLMOCR_AVAILABLE = False
+
 # 可选导入 DiT 适配器
 try:
     from .dit_layout_adapter import DitLayoutDetector
@@ -49,6 +56,9 @@ __all__ = [
     'PaddleLayoutDetector',
     'PaddleVLRecognizer',
     
+    # GLM-OCR 适配器
+    'GLMOCRVLRecognizer',
+    
     # Docling 适配器
     'DoclingLayoutDetector',
     # PP-DocLayoutV3 适配器

+ 413 - 0
ocr_tools/universal_doc_parser/models/adapters/glmocr_vl_adapter.py

@@ -0,0 +1,413 @@
+"""GLM-OCR VL识别适配器
+
+直接通过 HTTP 调用 GLM-OCR API(OpenAI 兼容格式)。
+支持表格、公式、文本和印章(seal)识别。
+
+架构说明:
+- 使用 requests 库直接调用 GLM-OCR HTTP API
+- 无需依赖 glmocr 包
+- 通过 task_prompt_mapping 配置不同任务的提示词
+- 支持图片预处理(尺寸控制)
+"""
+
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Union, Optional
+import numpy as np
+from PIL import Image
+from loguru import logger
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+import base64
+from io import BytesIO
+import json
+
+# 导入基类
+from .base import BaseVLRecognizer
+
+
+class GLMOCRVLRecognizer(BaseVLRecognizer):
+    """
+    GLM-OCR VL识别适配器
+    
+    配置示例:
+    ```yaml
+    vl_recognition:
+      module: "glmocr"
+      api_url: "http://10.192.72.11:20036/v1/chat/completions"
+      api_key: null  # 可选
+      model: "glm-ocr"
+      max_image_size: 3500
+      resize_mode: 'max'
+      task_prompt_mapping:
+        text: "Text Recognition:"
+        table: "Table Recognition:"
+        formula: "Formula Recognition:"
+        seal: "Seal Recognition:"
+      model_params:
+        connection_pool_size: 128
+        http_timeout: 300
+        retry_max_attempts: 2
+    ```
+    """
+    
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__(config)
+        
+        self.session = None
+        
+        # API 配置
+        self.api_url = config.get('api_url', 'http://127.0.0.1:8000/v1/chat/completions')
+        self.api_key = config.get('api_key')
+        self.model = config.get('model', 'glm-ocr')
+        self.verify_ssl = config.get('verify_ssl', False)
+        
+        # 图片尺寸限制配置
+        self.max_image_size = config.get('max_image_size', 3500)
+        self.resize_mode = config.get('resize_mode', 'max')
+        
+        # Task prompt mapping(任务提示词映射)
+        self.task_prompt_mapping = config.get('task_prompt_mapping', {
+            'text': 'Text Recognition:',
+            'table': 'Table Recognition:',
+            'formula': 'Formula Recognition:',
+            'seal': 'Seal Recognition:',
+        })
+        
+        # 模型参数
+        model_params = config.get('model_params', {})
+        self.connection_pool_size = model_params.get('connection_pool_size', 128)
+        self.http_timeout = model_params.get('http_timeout', 300)
+        self.connect_timeout = model_params.get('connect_timeout', 30)
+        self.retry_max_attempts = model_params.get('retry_max_attempts', 2)
+        
+        # 生成参数
+        self.max_tokens = model_params.get('max_tokens', 4096)
+        self.temperature = model_params.get('temperature', 0.8)
+        self.top_p = model_params.get('top_p', 0.9)
+        self.top_k = model_params.get('top_k', 50)
+        self.repetition_penalty = model_params.get('repetition_penalty', 1.1)
+        
+        logger.info(f"GLM-OCR VL Recognizer configured with max_image_size={self.max_image_size}")
+        logger.debug(f"Task prompt mapping: {self.task_prompt_mapping}")
+    
+    def initialize(self):
+        """初始化 HTTP 会话"""
+        try:
+            # 创建会话
+            self.session = requests.Session()
+            
+            # 配置连接池
+            adapter = HTTPAdapter(
+                pool_connections=self.connection_pool_size,
+                pool_maxsize=self.connection_pool_size,
+                max_retries=Retry(
+                    total=self.retry_max_attempts,
+                    backoff_factor=0.5,
+                    status_forcelist=[429, 500, 502, 503, 504],
+                )
+            )
+            self.session.mount('http://', adapter)
+            self.session.mount('https://', adapter)
+            
+            # 设置默认 headers
+            self.session.headers.update({
+                'Content-Type': 'application/json',
+            })
+            
+            if self.api_key:
+                self.session.headers.update({
+                    'Authorization': f'Bearer {self.api_key}'
+                })
+            
+            logger.success(f"✅ GLM-OCR VL recognizer initialized: {self.api_url}")
+            
+        except Exception as e:
+            logger.error(f"❌ Failed to initialize GLM-OCR VL recognizer: {e}")
+            raise
+    
+    def cleanup(self):
+        """清理资源"""
+        if self.session:
+            self.session.close()
+            self.session = None
+        logger.debug("GLM-OCR VL recognizer cleaned up")
+    
+    def _preprocess_image(self, image: Union[np.ndarray, Image.Image]) -> Image.Image:
+        """
+        预处理图片,控制尺寸避免序列长度超限
+        
+        Args:
+            image: 输入图片
+            
+        Returns:
+            处理后的PIL图片
+        """
+        # 转换为PIL图像
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        
+        # 获取原始尺寸
+        orig_w, orig_h = image.size
+        
+        # 计算缩放比例
+        if self.resize_mode == 'max':
+            # 保持宽高比,最长边不超过 max_image_size
+            max_dim = max(orig_w, orig_h)
+            if max_dim > self.max_image_size:
+                scale = self.max_image_size / max_dim
+                new_w = int(orig_w * scale)
+                new_h = int(orig_h * scale)
+                
+                logger.debug(f"🔄 Resizing image: {orig_w}x{orig_h} → {new_w}x{new_h} (scale={scale:.3f})")
+                image = image.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        
+        elif self.resize_mode == 'fixed':
+            # 固定尺寸(可能改变宽高比)
+            if orig_w != self.max_image_size or orig_h != self.max_image_size:
+                logger.debug(f"🔄 Resizing image: {orig_w}x{orig_h} → {self.max_image_size}x{self.max_image_size}")
+                image = image.resize((self.max_image_size, self.max_image_size), Image.Resampling.LANCZOS)
+        
+        return image
+    
+    def _build_request_for_image(
+        self, 
+        image: Image.Image, 
+        task_type: str = 'text'
+    ) -> Dict[str, Any]:
+        """
+        为单张图片构建 GLM-OCR API 请求
+        
+        Args:
+            image: PIL图片
+            task_type: 任务类型 ('text', 'table', 'formula', 'seal')
+            
+        Returns:
+            请求字典
+        """
+        # 获取任务对应的提示词
+        prompt_text = self.task_prompt_mapping.get(task_type, self.task_prompt_mapping.get('text', ''))
+        
+        # 将图片转为 base64
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+        img_url = f"data:image/jpeg;base64,{img_base64}"
+        
+        # 构建请求(OpenAI 兼容格式)
+        request_data = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": img_url}},
+                        {"type": "text", "text": prompt_text},
+                    ]
+                }
+            ],
+            "max_tokens": self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+            "repetition_penalty": self.repetition_penalty,
+        }
+        
+        return request_data
+    
+    def _call_ocr_api(self, image: Image.Image, task_type: str) -> str:
+        """
+        调用 GLM-OCR API 进行识别
+        
+        Args:
+            image: PIL图片
+            task_type: 任务类型
+            
+        Returns:
+            识别结果文本
+        """
+        if self.session is None:
+            raise RuntimeError("HTTP session not initialized")
+        
+        try:
+            # 构建请求
+            request_data = self._build_request_for_image(image, task_type)
+            
+            # 调用 API
+            response = self.session.post(
+                self.api_url,
+                json=request_data,
+                timeout=(self.connect_timeout, self.http_timeout),
+                verify=self.verify_ssl
+            )
+            
+            if response.status_code != 200:
+                logger.error(f"OCR API returned status {response.status_code}: {response.text}")
+                return ""
+            
+            # 解析响应
+            result = response.json()
+            
+            # 提取识别结果
+            if 'choices' in result and len(result['choices']) > 0:
+                content = result['choices'][0].get('message', {}).get('content', '')
+                return content
+            
+            logger.warning(f"No content in OCR response: {result}")
+            return ""
+            
+        except requests.exceptions.Timeout:
+            logger.error(f"OCR API timeout after {self.http_timeout}s")
+            return ""
+        except requests.exceptions.RequestException as e:
+            logger.error(f"OCR API request failed: {e}")
+            return ""
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse OCR response: {e}")
+            return ""
+        except Exception as e:
+            logger.error(f"OCR API call failed: {e}")
+            return ""
+    
+    def recognize_table(self, image: Union[np.ndarray, Image.Image], **kwargs) -> Dict[str, Any]:
+        """
+        识别表格
+        
+        Args:
+            image: 输入图片
+            **kwargs: 额外参数(未使用)
+            
+        Returns:
+            包含 'html' 和 'markdown' 的字典
+        """
+        try:
+            # 预处理图片
+            image = self._preprocess_image(image)
+            
+            # 调用 API
+            result_text = self._call_ocr_api(image, 'table')
+            
+            if not result_text:
+                return {'html': '', 'markdown': '', 'cells': []}
+            
+            # GLM-OCR 默认返回 Markdown 格式
+            # 如果需要 HTML,可以使用简单的转换(或保持 Markdown)
+            return {
+                'html': result_text,  # GLM-OCR 可能返回 HTML 或 Markdown
+                'markdown': result_text,
+                'cells': [],  # GLM-OCR 不直接返回单元格坐标
+            }
+            
+        except Exception as e:
+            logger.error(f"❌ Table recognition failed: {e}")
+            return {'html': '', 'markdown': '', 'cells': []}
+    
+    def recognize_formula(self, image: Union[np.ndarray, Image.Image], **kwargs) -> Dict[str, Any]:
+        """
+        识别公式
+        
+        Args:
+            image: 输入图片
+            **kwargs: 额外参数(未使用)
+            
+        Returns:
+            包含 'latex' 的字典
+        """
+        try:
+            # 预处理图片
+            image = self._preprocess_image(image)
+            
+            # 调用 API
+            result_text = self._call_ocr_api(image, 'formula')
+            
+            if not result_text:
+                return {'latex': '', 'confidence': 0.0, 'raw': {}}
+            
+            # 清理 LaTeX 格式(移除 markdown 代码块标记)
+            latex = self._clean_latex(result_text)
+            
+            return {
+                'latex': latex,
+                'confidence': 0.9 if latex else 0.0,
+                'raw': {'raw_output': result_text}
+            }
+            
+        except Exception as e:
+            logger.error(f"❌ Formula recognition failed: {e}")
+            return {'latex': '', 'confidence': 0.0, 'raw': {}}
+    
+    def recognize_text(self, image: Union[np.ndarray, Image.Image], **kwargs) -> Dict[str, Any]:
+        """
+        识别文本区域(包括普通文本和印章)
+        
+        Args:
+            image: 输入图片
+            **kwargs: 额外参数,可包含 'element_type' 指定类型(如 'seal')
+            
+        Returns:
+            包含 'text' 的字典
+        """
+        try:
+            # 预处理图片
+            image = self._preprocess_image(image)
+            
+            # 确定任务类型(如果是 seal,使用 seal 提示词)
+            element_type = kwargs.get('element_type', 'text')
+            task_type = 'seal' if element_type == 'seal' else 'text'
+            
+            # 调用 API
+            result_text = self._call_ocr_api(image, task_type)
+            
+            return {
+                'text': result_text or '',
+                'confidence': 0.9 if result_text else 0.0
+            }
+            
+        except Exception as e:
+            logger.error(f"❌ Text recognition failed: {e}")
+            return {'text': '', 'confidence': 0.0}
+    
+    def _clean_latex(self, latex_str: str) -> str:
+        """
+        清理 LaTeX 字符串,移除 Markdown 代码块标记
+        
+        Args:
+            latex_str: 原始 LaTeX 字符串
+            
+        Returns:
+            清理后的 LaTeX
+        """
+        if not latex_str:
+            return ""
+        
+        # 移除 Markdown 代码块标记
+        latex_str = latex_str.strip()
+        if latex_str.startswith('```'):
+            lines = latex_str.split('\n')
+            # 移除第一行的 ```latex 或 ```
+            if lines[0].startswith('```'):
+                lines = lines[1:]
+            # 移除最后一行的 ```
+            if lines and lines[-1].strip() == '```':
+                lines = lines[:-1]
+            latex_str = '\n'.join(lines)
+        
+        # 移除行内代码标记
+        if latex_str.startswith('`') and latex_str.endswith('`'):
+            latex_str = latex_str[1:-1]
+        
+        # 移除常见的 LaTeX 包裹符号
+        latex_str = latex_str.strip()
+        if latex_str.startswith('$') and latex_str.endswith('$'):
+            # 移除单个 $ 或 $$
+            if latex_str.startswith('$$') and latex_str.endswith('$$'):
+                latex_str = latex_str[2:-2]
+            else:
+                latex_str = latex_str[1:-1]
+        
+        return latex_str.strip()
+
+
+# 导出适配器类
+__all__ = ['GLMOCRVLRecognizer']

+ 1 - 1
ocr_tools/universal_doc_parser/models/adapters/pp_doclayout_v3_layout_adapter.py

@@ -77,7 +77,7 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
         "paragraph_title": "title",
         "reference": "text",
         "reference_content": "text",
-        "seal": "image_body",
+        "seal": "seal",  # 🔧 修改:保留 seal 作为独立类别,用于 VLM 识别
         "table": "table_body",
         "text": "text",
         "vision_footnote": "page_footnote",

+ 188 - 0
ocr_tools/universal_doc_parser/tests/test_glmocr_adapter.py

@@ -0,0 +1,188 @@
+#!/opt/miniconda3/envs/mineru2/bin/python
+"""测试 GLM-OCR 适配器加载
+
+验证:
+1. 适配器类可以正确导入
+2. 配置文件可以正确解析
+3. 适配器可以正确初始化
+"""
+
+import sys
+from pathlib import Path
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parents[1]
+sys.path.insert(0, str(project_root))
+
+from loguru import logger
+
+def test_import_adapter():
+    """测试导入适配器"""
+    logger.info("测试 1: 导入 GLM-OCR 适配器...")
+    try:
+        from models.adapters import GLMOCRVLRecognizer
+        logger.success("✅ GLMOCRVLRecognizer 导入成功")
+        return True
+    except Exception as e:
+        logger.error(f"❌ 导入失败: {e}")
+        return False
+
+
+def test_load_config():
+    """测试加载配置文件"""
+    logger.info("测试 2: 加载配置文件...")
+    try:
+        import yaml
+        # 修正配置文件路径
+        config_path = project_root / "config" / "bank_statement_glm_vl.yaml"
+        if not config_path.exists():
+            # 尝试其他可能的路径
+            config_path = Path(__file__).parent / "config" / "bank_statement_glm_vl.yaml"
+        
+        if not config_path.exists():
+            logger.warning(f"⚠️  配置文件不存在: {config_path}")
+            logger.warning("跳过配置文件测试")
+            return True, None  # 不算失败
+        
+        with open(config_path, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f)
+        
+        logger.info(f"配置场景: {config.get('scene_name')}")
+        logger.info(f"VL模块: {config.get('vl_recognition', {}).get('module')}")
+        logger.info(f"Layout模块: {config.get('layout_detection', {}).get('module')}")
+        
+        logger.success("✅ 配置文件加载成功")
+        return True, config
+    except Exception as e:
+        logger.error(f"❌ 配置加载失败: {e}")
+        return False, None
+
+
+def test_create_adapter():
+    """测试创建适配器实例"""
+    logger.info("测试 3: 创建适配器实例...")
+    try:
+        from models.adapters import GLMOCRVLRecognizer
+        
+        # 简化的配置
+        config = {
+            'module': 'glmocr',
+            'api_url': 'http://10.192.72.11:20036/v1/chat/completions',
+            'model': 'glm-ocr',
+            'max_image_size': 3500,
+            'resize_mode': 'max',
+            'task_prompt_mapping': {
+                'text': 'Text Recognition:',
+                'table': 'Table Recognition:',
+                'formula': 'Formula Recognition:',
+                'seal': 'Seal Recognition:',
+            },
+            'model_params': {
+                'connection_pool_size': 128,
+                'http_timeout': 300,
+                'retry_max_attempts': 2,
+            }
+        }
+        
+        recognizer = GLMOCRVLRecognizer(config)
+        logger.info(f"适配器类型: {type(recognizer)}")
+        logger.info(f"最大图片尺寸: {recognizer.max_image_size}")
+        logger.info(f"任务提示词: {recognizer.task_prompt_mapping}")
+        
+        logger.success("✅ 适配器实例创建成功")
+        return True, recognizer
+    except Exception as e:
+        logger.error(f"❌ 适配器创建失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False, None
+
+
+def test_initialize_adapter(recognizer):
+    """测试初始化适配器(需要 API 服务可用)"""
+    logger.info("测试 4: 初始化适配器(需要 GLM-OCR API 服务)...")
+    try:
+        recognizer.initialize()
+        logger.success("✅ 适配器初始化成功")
+        logger.info(f"HTTP Session: {recognizer.session}")
+        return True
+    except Exception as e:
+        logger.warning(f"⚠️  适配器初始化失败(可能是 API 服务不可用): {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def test_model_factory():
+    """测试通过工厂创建适配器"""
+    logger.info("测试 5: 通过 ModelFactory 创建适配器...")
+    try:
+        from core.model_factory import ModelFactory
+        
+        config = {
+            'module': 'glmocr',
+            'api_url': 'http://10.192.72.11:20036/v1/chat/completions',
+            'model': 'glm-ocr',
+            'max_image_size': 3500,
+            'model_params': {
+                'connection_pool_size': 128,
+                'http_timeout': 300,
+            }
+        }
+        
+        recognizer = ModelFactory.create_vl_recognizer(config)
+        logger.info(f"适配器类型: {type(recognizer).__name__}")
+        logger.success("✅ ModelFactory 创建适配器成功")
+        return True
+    except Exception as e:
+        logger.error(f"❌ ModelFactory 创建失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """主测试函数"""
+    logger.info("="*60)
+    logger.info("开始测试 GLM-OCR 适配器")
+    logger.info("="*60)
+    
+    results = []
+    
+    # 测试 1: 导入
+    results.append(("导入适配器", test_import_adapter()))
+    
+    # 测试 2: 加载配置
+    success, config = test_load_config()
+    results.append(("加载配置", success))
+    
+    # 测试 3: 创建实例
+    success, recognizer = test_create_adapter()
+    results.append(("创建实例", success))
+    
+    # 测试 4: 初始化(可选,需要 API 服务)
+    if success and recognizer:
+        init_success = test_initialize_adapter(recognizer)
+        results.append(("初始化适配器", init_success))
+    
+    # 测试 5: 工厂方法
+    results.append(("ModelFactory", test_model_factory()))
+    
+    # 汇总结果
+    logger.info("="*60)
+    logger.info("测试结果汇总:")
+    logger.info("="*60)
+    for test_name, result in results:
+        status = "✅ 通过" if result else "❌ 失败"
+        logger.info(f"{test_name:20s}: {status}")
+    
+    passed = sum(1 for _, r in results if r)
+    total = len(results)
+    logger.info(f"\n总计: {passed}/{total} 测试通过")
+    
+    return passed == total
+
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)

+ 135 - 0
ocr_tools/universal_doc_parser/tests/test_glmocr_api.py

@@ -0,0 +1,135 @@
+#!/opt/miniconda3/envs/mineru2/bin/python
+"""测试 GLM-OCR API 实际调用
+
+验证:
+1. 文本识别
+2. API 请求和响应
+"""
+
+import sys
+from pathlib import Path
+import numpy as np
+from PIL import Image
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parents[1]
+sys.path.insert(0, str(project_root))
+
+from loguru import logger
+from models.adapters import GLMOCRVLRecognizer
+
+
+def test_text_recognition():
+    """测试文本识别(使用简单测试图片)"""
+    logger.info("测试: 文本识别...")
+    
+    try:
+        # 创建配置
+        config = {
+            'module': 'glmocr',
+            'api_url': 'http://10.192.72.11:20036/v1/chat/completions',
+            'model': 'glm-ocr',
+            'max_image_size': 3500,
+            'resize_mode': 'max',
+            'task_prompt_mapping': {
+                'text': 'Text Recognition:',
+            },
+            'model_params': {
+                'connection_pool_size': 128,
+                'http_timeout': 300,
+            }
+        }
+        
+        # 创建识别器
+        recognizer = GLMOCRVLRecognizer(config)
+        recognizer.initialize()
+        
+        # 创建简单测试图片(白底黑字)
+        img = Image.new('RGB', (200, 100), color='white')
+        # 注意:这只是一个占位图片,实际识别需要有文字的图片
+        
+        logger.info("调用 recognize_text()...")
+        result = recognizer.recognize_text(img)
+        
+        logger.info(f"识别结果: {result}")
+        logger.info(f"文本内容: {result.get('text', '')[:100]}")
+        logger.info(f"置信度: {result.get('confidence', 0.0)}")
+        
+        recognizer.cleanup()
+        
+        if result.get('text') is not None:
+            logger.success("✅ 文本识别测试通过")
+            return True
+        else:
+            logger.warning("⚠️  未获取到识别结果(可能是测试图片为空)")
+            return True  # 不算失败
+            
+    except Exception as e:
+        logger.error(f"❌ 文本识别测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def test_api_connection():
+    """测试 API 连接"""
+    logger.info("测试: API 连接...")
+    
+    try:
+        import requests
+        
+        api_url = 'http://10.192.72.11:20036/v1/chat/completions'
+        
+        # 发送简单健康检查请求
+        response = requests.get(
+            api_url.replace('/v1/chat/completions', '/health'),
+            timeout=10
+        )
+        
+        if response.status_code == 200:
+            logger.success("✅ API 服务可访问")
+            return True
+        else:
+            logger.warning(f"⚠️  API 返回状态码: {response.status_code}")
+            return True  # 不算失败,可能不支持 /health 端点
+            
+    except requests.exceptions.ConnectionError:
+        logger.warning("⚠️  无法连接到 API 服务(可能未启动)")
+        return True  # 不算失败
+    except Exception as e:
+        logger.warning(f"⚠️  API 连接测试异常: {e}")
+        return True  # 不算失败
+
+
+def main():
+    """主测试函数"""
+    logger.info("="*60)
+    logger.info("GLM-OCR API 实际调用测试")
+    logger.info("="*60)
+    
+    results = []
+    
+    # 测试 API 连接
+    results.append(("API连接", test_api_connection()))
+    
+    # 测试文本识别
+    results.append(("文本识别", test_text_recognition()))
+    
+    # 汇总结果
+    logger.info("="*60)
+    logger.info("测试结果汇总:")
+    logger.info("="*60)
+    for test_name, result in results:
+        status = "✅ 通过" if result else "❌ 失败"
+        logger.info(f"{test_name:20s}: {status}")
+    
+    passed = sum(1 for _, r in results if r)
+    total = len(results)
+    logger.info(f"\n总计: {passed}/{total} 测试通过")
+    
+    return passed == total
+
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)

+ 20 - 0
ocr_utils/json_formatters.py

@@ -180,6 +180,20 @@ class JSONFormatters:
                     }]
                 }]
         
+        # 印章类型
+        elif elem_type == 'seal':
+            text = content.get('text', '') if isinstance(content, dict) else str(content)
+            confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
+            block['lines'] = [{
+                'bbox': bbox,
+                'spans': [{
+                    'bbox': bbox,
+                    'type': 'seal',
+                    'content': text,
+                    'confidence': confidence
+                }]
+            }]
+        
         # 丢弃类型
         elif elem_type in ['abandon', 'discarded']:
             block['type'] = 'abandon'
@@ -361,6 +375,12 @@ class JSONFormatters:
             result['type'] = elem_type
             result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
         
+        # 印章类型
+        elif elem_type == 'seal':
+            result['type'] = 'seal'
+            result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
+            result['confidence'] = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
+        
         # 丢弃元素
         elif elem_type in ['discarded', 'abandon']:
             result['type'] = 'discarded'

+ 13 - 0
ocr_utils/markdown_generator.py

@@ -276,6 +276,12 @@ pages: {len(results.get('pages', []))}
                         else:
                             md_lines.append(f"*{text}*")
                         md_lines.append("")
+                
+                elif elem_type == 'seal':
+                    text = content.get('text', '') if isinstance(content, dict) else str(content)
+                    if text:
+                        md_lines.append(f"🔖 **[印章]** {text}")
+                        md_lines.append("")
         
         return '\n'.join(md_lines)
     
@@ -371,6 +377,13 @@ pages: {len(results.get('pages', []))}
                         md_lines.append(f"*{text}*")
                     md_lines.append("")
             
+            elif elem_type == 'seal':
+                text = content.get('text', '') if isinstance(content, dict) else str(content)
+                if text:
+                    confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
+                    md_lines.append(f"🔖 **[印章]** {text} _(置信度: {confidence:.2f})_")
+                    md_lines.append("")
+            
             elif elem_type == 'discarded':
                 text = content.get('text', '') if isinstance(content, dict) else ''
                 if text:

+ 7 - 0
ocr_validator/config/A用户_单元格扫描流水.yaml

@@ -4,6 +4,13 @@ document:
 
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/B用户_扫描流水.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 0 - 2
ocr_validator/config/global.yaml

@@ -160,5 +160,3 @@ data_sources:
   - 张_微信图.yaml
   - 付_工商银行943825图.yaml
   - 许_民生银行图.yaml
-
-

+ 7 - 0
ocr_validator/config/乔_建设银行图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/付_工商银行943825图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/对公_招商银行图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/山西云集科技有限公司.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/康强_北京农村商业银行.yaml

@@ -5,6 +5,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/张_微信图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/德_内蒙古银行照.yaml

@@ -5,6 +5,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/提取自赤峰黄金2023年报.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/施博深.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/朱_中信银行图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 7 - 0
ocr_validator/config/湛_平安银行图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"

+ 9 - 2
ocr_validator/config/至远彩色_2023年报.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"
@@ -13,8 +20,8 @@ document:
 
     # bank_statement_yusys_v2
     - tool: "mineru"
-      result_dir: "bank_statement_yusys_v2"
-      image_dir: "mineru_vllm_results/{{name}}"
+      result_dir: "bank_statement_glm_vl"
+      image_dir: "bank_statement_glm_vl/{{name}}"
       description: "YUSYS统一OCR框架"
       enabled: true
 

+ 7 - 0
ocr_validator/config/许_民生银行图.yaml

@@ -4,6 +4,13 @@ document:
   
   # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
   ocr_results:
+    # bank_statement_yusys_v4
+    - tool: "mineru"
+      result_dir: "bank_statement_yusys_v4"
+      image_dir: "bank_statement_yusys_v4/{{name}}"
+      description: "YUSYS-OCR框架 v4.0 GLM-OCR"
+      enabled: true
+
     # bank_statement_yusys_v3
     - tool: "mineru"
       result_dir: "bank_statement_yusys_v3"