#!/bin/bash
# filepath: process_pdf_batch.sh

# ============================================================================
# PDF batch processing script
# Purpose: run ppstructurev3_single_client.py once for each PDF in a list.
#
# The PDF list comes from (in priority order): a list file (-f), a
# comma-separated argument (-p), or a recursive search of the base directory.
# A summary is printed and written to batch_process_<timestamp>.log in the
# current directory. Exit status is 1 if any file failed, 0 otherwise.
#
# Kept compatible with bash 3.2 (the /bin/bash shipped with macOS).
# ============================================================================

set -e  # Exit immediately on unhandled errors

# ============================================================================
# Default configuration
# ============================================================================
DEFAULT_SCRIPT="ppstructurev3_single_client.py"
DEFAULT_BASE_DIR="/Users/zhch158/workspace/data/流水分析"
DEFAULT_OUTPUT_SUBDIR="data_PPStructureV3_Results"
DEFAULT_API_URL="http://10.192.72.11:8111/layout-parsing"
DEFAULT_TIMEOUT=300

# ============================================================================
# Color definitions
# ANSI-C quoting stores real ESC bytes, so the colors render both through
# plain `echo` and through the `cat` heredoc in usage() (which never
# interprets backslash escapes).
# ============================================================================
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'  # No Color

# ============================================================================
# Helpers
# ============================================================================

# Print $1 with leading and trailing whitespace removed (internal whitespace,
# quotes and backslashes are preserved).
trim() {
    local s=$1
    s="${s#"${s%%[![:space:]]*}"}"
    s="${s%"${s##*[![:space:]]}"}"
    printf '%s' "$s"
}

# Print a duration given in seconds ($1) as HH:MM:SS. Hours are not wrapped
# at 24.
format_duration() {
    local secs=$1
    printf '%02d:%02d:%02d' \
        $((secs / 3600)) $((secs % 3600 / 60)) $((secs % 60))
}

# Abort with an error unless the option named $1 is followed by a value.
# $2 is the number of remaining command-line arguments, option included.
require_value() {
    if (( $2 < 2 )); then
        echo "${RED}错误: 选项 '$1' 需要一个参数${NC}"
        echo "使用 -h 或 --help 查看帮助信息"
        exit 1
    fi
}

# ============================================================================
# Help text
# ============================================================================

# Print the usage text and exit with status 0.
usage() {
    cat << EOF
${BLUE}=============================================================================
PDF 批量处理脚本
==============================================================================${NC}

${GREEN}用法:${NC}
    $0 [选项]

${GREEN}选项:${NC}
    -s, --script SCRIPT           Python 脚本路径 (默认: ${DEFAULT_SCRIPT})
    -d, --base-dir DIR            PDF 文件基础目录 (默认: ${DEFAULT_BASE_DIR})
    -o, --output-subdir SUBDIR    输出子目录名称 (默认: ${DEFAULT_OUTPUT_SUBDIR})
    -u, --api-url URL             API 地址 (默认: ${DEFAULT_API_URL})
    -t, --timeout SECONDS         超时时间(秒) (默认: ${DEFAULT_TIMEOUT})
    -f, --file-list FILE          PDF 文件列表文件路径
    -p, --pdf-list "PDF1,PDF2"    PDF 文件列表(逗号分隔)
    -h, --help                    显示此帮助信息

${GREEN}PDF 文件指定方式 (按优先级):${NC}
    1. 通过 -f 参数指定文件列表
    2. 通过 -p 参数直接指定 PDF 列表
    3. 处理基础目录下所有 .pdf 文件

${GREEN}文件列表格式 (每行一个):${NC}
    子目录名/文件名.pdf
    或
    完整路径/文件名.pdf

${GREEN}示例:${NC}
    # 1. 使用文件列表
    $0 -f pdf_list.txt

    # 2. 直接指定 PDF 列表
    $0 -p "A用户_单元格扫描流水.pdf,B用户_扫描流水.pdf"

    # 3. 处理指定目录下所有 PDF
    $0 -d "/path/to/pdfs"

    # 4. 完整参数示例
    $0 -s "./ppstructurev3_single_client.py" \\
       -d "/Users/zhch158/workspace/data/流水分析" \\
       -o "data_PPStructureV3_Results" \\
       -u "http://10.192.72.11:8111/layout-parsing" \\
       -t 600 \\
       -f "pdf_list.txt"

${YELLOW}文件列表示例 (pdf_list.txt):${NC}
    A用户_单元格扫描流水/A用户_单元格扫描流水.pdf
    A用户_单元格图片合成/A用户_单元格图片合成.pdf
    B用户_扫描流水/B用户_扫描流水.pdf
    对公_招商银行图/对公_招商银行图.pdf

EOF
    exit 0
}

# ============================================================================
# Argument parsing
# ============================================================================
SCRIPT="${DEFAULT_SCRIPT}"
BASE_DIR="${DEFAULT_BASE_DIR}"
OUTPUT_SUBDIR="${DEFAULT_OUTPUT_SUBDIR}"
API_URL="${DEFAULT_API_URL}"
TIMEOUT="${DEFAULT_TIMEOUT}"
FILE_LIST=""
PDF_LIST=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        -s|--script)
            require_value "$1" "$#"
            SCRIPT="$2"
            shift 2
            ;;
        -d|--base-dir)
            require_value "$1" "$#"
            BASE_DIR="$2"
            shift 2
            ;;
        -o|--output-subdir)
            require_value "$1" "$#"
            OUTPUT_SUBDIR="$2"
            shift 2
            ;;
        -u|--api-url)
            require_value "$1" "$#"
            API_URL="$2"
            shift 2
            ;;
        -t|--timeout)
            require_value "$1" "$#"
            TIMEOUT="$2"
            shift 2
            ;;
        -f|--file-list)
            require_value "$1" "$#"
            FILE_LIST="$2"
            shift 2
            ;;
        -p|--pdf-list)
            require_value "$1" "$#"
            PDF_LIST="$2"
            shift 2
            ;;
        -h|--help)
            usage
            ;;
        *)
            echo "${RED}错误: 未知参数 '$1'${NC}"
            echo "使用 -h 或 --help 查看帮助信息"
            exit 1
            ;;
    esac
done

# ============================================================================
# Validate parameters
# ============================================================================
echo "${BLUE}==============================================================================${NC}"
echo "${BLUE}PDF 批量处理任务启动${NC}"
echo "${BLUE}==============================================================================${NC}"
echo ""

# The Python script must exist
if [[ ! -f "${SCRIPT}" ]]; then
    echo "${RED}错误: Python 脚本不存在: ${SCRIPT}${NC}"
    exit 1
fi
echo "${GREEN}✓ Python 脚本:${NC} ${SCRIPT}"

# The base directory must exist
if [[ ! -d "${BASE_DIR}" ]]; then
    echo "${RED}错误: 基础目录不存在: ${BASE_DIR}${NC}"
    exit 1
fi
echo "${GREEN}✓ 基础目录:${NC} ${BASE_DIR}"
echo "${GREEN}✓ 输出子目录:${NC} ${OUTPUT_SUBDIR}"
echo "${GREEN}✓ API 地址:${NC} ${API_URL}"
echo "${GREEN}✓ 超时时间:${NC} ${TIMEOUT} 秒"
echo ""

# ============================================================================
# Build the PDF file list
# ============================================================================
declare -a PDF_FILES=()

# Mode 1: read entries from a list file
if [[ -n "${FILE_LIST}" ]]; then
    if [[ ! -f "${FILE_LIST}" ]]; then
        echo "${RED}错误: 文件列表不存在: ${FILE_LIST}${NC}"
        exit 1
    fi

    echo "${YELLOW}从文件列表读取: ${FILE_LIST}${NC}"
    # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Trim first (this also drops a trailing CR), so indented comments
        # and whitespace-only lines are recognised below.
        line=$(trim "$line")

        # Skip blank lines and comments
        if [[ -z "$line" || "$line" == \#* ]]; then
            continue
        fi

        # Relative paths are resolved against the base directory
        if [[ "$line" != /* ]]; then
            line="${BASE_DIR}/${line}"
        fi

        PDF_FILES+=("$line")
    done < "${FILE_LIST}"

# Mode 2: read entries from the comma-separated argument
elif [[ -n "${PDF_LIST}" ]]; then
    echo "${YELLOW}从参数列表读取 PDF 文件${NC}"
    IFS=',' read -ra PDFS <<< "$PDF_LIST"
    for pdf in "${PDFS[@]}"; do
        pdf=$(trim "$pdf")

        # Ignore empty elements such as the one in "a.pdf,,b.pdf"
        if [[ -z "$pdf" ]]; then
            continue
        fi

        # Relative paths are resolved against the base directory
        if [[ "$pdf" != /* ]]; then
            if [[ "$pdf" == */* ]]; then
                # Already carries a sub-directory component
                pdf="${BASE_DIR}/${pdf}"
            else
                # Bare file name: look in the sub-directory named after the
                # file (name without the .pdf suffix)
                pdf_name="${pdf%.pdf}"
                pdf="${BASE_DIR}/${pdf_name}/${pdf}"
            fi
        fi

        PDF_FILES+=("$pdf")
    done

# Mode 3: find every PDF below the base directory
else
    echo "${YELLOW}在基础目录中查找所有 PDF 文件${NC}"
    # NUL-delimited so that any file name is handled safely
    while IFS= read -r -d $'\0' file; do
        PDF_FILES+=("$file")
    done < <(find "${BASE_DIR}" -type f -name "*.pdf" -print0 | sort -z)
fi

# At least one PDF is required
if [[ ${#PDF_FILES[@]} -eq 0 ]]; then
    echo "${RED}错误: 未找到任何 PDF 文件${NC}"
    exit 1
fi

echo "${GREEN}找到 ${#PDF_FILES[@]} 个 PDF 文件:${NC}"
for i in "${!PDF_FILES[@]}"; do
    echo "  ${BLUE}[$((i+1))]${NC} ${PDF_FILES[$i]}"
done
echo ""

# ============================================================================
# Confirm before running
# ============================================================================
confirm=""
if ! read -r -p "${YELLOW}是否继续处理? [Y/n]: ${NC}" confirm; then
    # EOF on stdin (e.g. non-interactive run): fail loudly rather than
    # letting `set -e` end the script without any message.
    echo ""
    echo "${RED}错误: 无法读取确认输入${NC}"
    exit 1
fi
confirm=${confirm:-Y}

if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
    echo "${YELLOW}已取消处理${NC}"
    exit 0
fi
echo ""

# ============================================================================
# Process the PDF files
# ============================================================================
success_count=0
failed_count=0
skipped_count=0
declare -a failed_files=()
total=${#PDF_FILES[@]}

echo "${BLUE}==============================================================================${NC}"
echo "${BLUE}开始批量处理${NC}"
echo "${BLUE}==============================================================================${NC}"
echo ""

# Keep both the epoch (for durations) and a printable timestamp (for the
# log); `date -r <epoch>` is BSD-only and is therefore avoided.
start_time=$(date +%s)
start_time_str=$(date)

for i in "${!PDF_FILES[@]}"; do
    pdf_file="${PDF_FILES[$i]}"
    current=$((i+1))

    echo "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
    echo "${BLUE}[${current}/${total}] 处理: ${pdf_file}${NC}"
    echo "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"

    # Skip entries that do not exist on disk.
    # Counters use var=$((var + 1)): `((var++))` returns status 1 when the
    # old value is 0, which aborts the script under `set -e` (bash >= 4.1).
    if [[ ! -f "$pdf_file" ]]; then
        echo "${RED}⚠ 跳过: 文件不存在${NC}"
        skipped_count=$((skipped_count + 1))
        echo ""
        continue
    fi

    # Results go into OUTPUT_SUBDIR next to the PDF
    pdf_dir=$(dirname "$pdf_file")
    output_dir="${pdf_dir}/${OUTPUT_SUBDIR}"

    # Show what is about to be processed
    echo "${GREEN}📄 输入文件:${NC} $pdf_file"
    echo "${GREEN}📁 输出目录:${NC} $output_dir"
    echo "${GREEN}🌐 API 地址:${NC} $API_URL"
    echo ""

    # NOTE(review): directory creation is disabled here; presumably the
    # Python client creates the output directory itself — confirm.
    # mkdir -p "$output_dir"

    # Run the client; being the `if` condition keeps a failure from
    # triggering `set -e`, so the batch continues with the next file.
    echo "${YELLOW}开始处理...${NC}"
    file_start_time=$(date +%s)

    if python "${SCRIPT}" \
        --input_file "$pdf_file" \
        --output_dir "$output_dir" \
        --api_url "$API_URL" \
        --timeout "$TIMEOUT"; then

        file_end_time=$(date +%s)
        file_duration=$((file_end_time - file_start_time))

        echo "${GREEN}✓ 成功${NC} (耗时: ${file_duration}秒)"
        success_count=$((success_count + 1))
    else
        file_end_time=$(date +%s)
        file_duration=$((file_end_time - file_start_time))

        echo "${RED}✗ 失败${NC} (耗时: ${file_duration}秒)"
        failed_count=$((failed_count + 1))
        failed_files+=("$pdf_file")
    fi

    echo ""
done

end_time=$(date +%s)
end_time_str=$(date)
total_duration=$((end_time - start_time))

# ============================================================================
# Print the summary
# ============================================================================
echo "${BLUE}==============================================================================${NC}"
echo "${BLUE}处理完成${NC}"
echo "${BLUE}==============================================================================${NC}"
echo ""
echo "${GREEN}📊 统计信息:${NC}"
echo "  总文件数:   ${total}"
echo "  ${GREEN}✓ 成功:     ${success_count}${NC}"
echo "  ${RED}✗ 失败:     ${failed_count}${NC}"
echo "  ${YELLOW}⊘ 跳过:     ${skipped_count}${NC}"
echo "  ⏱️  总耗时:   ${total_duration} 秒 ($(format_duration "$total_duration"))"
echo ""

# List the files that failed
if [[ ${failed_count} -gt 0 ]]; then
    echo "${RED}失败的文件:${NC}"
    for file in "${failed_files[@]}"; do
        echo "  ${RED}✗${NC} $file"
    done
    echo ""
fi

# Write the log file (plain text, no color codes) into the current directory
log_file="batch_process_$(date +%Y%m%d_%H%M%S).log"
{
    echo "批量处理日志"
    echo "============================================"
    echo "开始时间: ${start_time_str}"
    echo "结束时间: ${end_time_str}"
    echo "总耗时: ${total_duration} 秒"
    echo ""
    echo "统计信息:"
    echo "  总文件数: ${total}"
    echo "  成功: ${success_count}"
    echo "  失败: ${failed_count}"
    echo "  跳过: ${skipped_count}"
    echo ""

    if [[ ${failed_count} -gt 0 ]]; then
        echo "失败的文件:"
        for file in "${failed_files[@]}"; do
            echo "  - $file"
        done
    fi
} > "$log_file"

echo "${GREEN}📝 日志已保存: ${log_file}${NC}"
echo ""

# Exit status: non-zero when at least one file failed
if [[ ${failed_count} -gt 0 ]]; then
    exit 1
else
    exit 0
fi