|
|
@@ -0,0 +1,381 @@
|
|
|
#!/bin/bash
# filepath: process_pdf_batch.sh

# ============================================================================
# PDF batch-processing script
# Purpose: invoke ppstructurev3_single_client.py once per PDF for a whole
#          batch of PDF files.
# ============================================================================

set -e  # abort on the first unhandled command failure

# ============================================================================
# Default configuration
# ============================================================================
DEFAULT_SCRIPT="ppstructurev3_single_client.py"
DEFAULT_BASE_DIR="/Users/zhch158/workspace/data/流水分析"
DEFAULT_OUTPUT_SUBDIR="data_PPStructureV3_Results"
DEFAULT_API_URL="http://10.192.72.11:8111/layout-parsing"
DEFAULT_TIMEOUT=300

# ============================================================================
# Colour definitions
# ============================================================================
# The variables hold real ESC bytes (ANSI-C quoting) so they render the same
# through echo -e, printf and cat. They are left empty when stdout is not a
# terminal or NO_COLOR is set, so redirected output stays free of escapes.
if [[ -t 1 && -z "${NO_COLOR:-}" ]]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'  # No Color (reset)
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi
|
|
|
+
|
|
|
# ============================================================================
# Help text
# ============================================================================
#######################################
# Print the usage/help text and terminate the script.
# Globals:   BLUE, GREEN, YELLOW, NC (read); DEFAULT_* settings (read)
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 0
#######################################
usage() {
  # cat copies the here-doc verbatim, so colour variables that hold backslash
  # escapes (e.g. '\033[0;34m') would be shown literally. Expand them to real
  # bytes first; values that are already raw bytes or empty pass through.
  local blue green yellow reset
  blue=$(printf '%b' "${BLUE:-}")
  green=$(printf '%b' "${GREEN:-}")
  yellow=$(printf '%b' "${YELLOW:-}")
  reset=$(printf '%b' "${NC:-}")

  cat <<EOF
${blue}=============================================================================
PDF 批量处理脚本
==============================================================================${reset}

${green}用法:${reset}
    $0 [选项]

${green}选项:${reset}
    -s, --script SCRIPT         Python 脚本路径 (默认: ${DEFAULT_SCRIPT})
    -d, --base-dir DIR          PDF 文件基础目录 (默认: ${DEFAULT_BASE_DIR})
    -o, --output-subdir SUBDIR  输出子目录名称 (默认: ${DEFAULT_OUTPUT_SUBDIR})
    -u, --api-url URL           API 地址 (默认: ${DEFAULT_API_URL})
    -t, --timeout SECONDS       超时时间(秒) (默认: ${DEFAULT_TIMEOUT})
    -f, --file-list FILE        PDF 文件列表文件路径
    -p, --pdf-list "PDF1,PDF2"  PDF 文件列表(逗号分隔)
    -h, --help                  显示此帮助信息

${green}PDF 文件指定方式 (按优先级):${reset}
    1. 通过 -f 参数指定文件列表
    2. 通过 -p 参数直接指定 PDF 列表
    3. 处理基础目录下所有 .pdf 文件

${green}文件列表格式 (每行一个):${reset}
    子目录名/文件名.pdf
    或
    完整路径/文件名.pdf

${green}示例:${reset}
    # 1. 使用文件列表
    $0 -f pdf_list.txt

    # 2. 直接指定 PDF 列表
    $0 -p "A用户_单元格扫描流水.pdf,B用户_扫描流水.pdf"

    # 3. 处理指定目录下所有 PDF
    $0 -d "/path/to/pdfs"

    # 4. 完整参数示例
    $0 -s "./ppstructurev3_single_client.py" \\
       -d "/Users/zhch158/workspace/data/流水分析" \\
       -o "data_PPStructureV3_Results" \\
       -u "http://10.192.72.11:8111/layout-parsing" \\
       -t 600 \\
       -f "pdf_list.txt"

${yellow}文件列表示例 (pdf_list.txt):${reset}
    A用户_单元格扫描流水/A用户_单元格扫描流水.pdf
    A用户_单元格图片合成/A用户_单元格图片合成.pdf
    B用户_扫描流水/B用户_扫描流水.pdf
    对公_招商银行图/对公_招商银行图.pdf

EOF
  exit 0
}
|
|
|
+
|
|
|
# ============================================================================
# Argument parsing
# ============================================================================
SCRIPT="${DEFAULT_SCRIPT}"
BASE_DIR="${DEFAULT_BASE_DIR}"
OUTPUT_SUBDIR="${DEFAULT_OUTPUT_SUBDIR}"
API_URL="${DEFAULT_API_URL}"
TIMEOUT="${DEFAULT_TIMEOUT}"
FILE_LIST=""
PDF_LIST=""

#######################################
# Abort with a message when an option is not followed by a value.
# Without this check `shift 2` fails and, under `set -e`, the script would
# terminate without telling the user why.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
# Returns:   0 when a value is present; otherwise exits with status 1
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo -e "${RED}错误: 选项 '$1' 缺少参数值${NC}"
    echo "使用 -h 或 --help 查看帮助信息"
    exit 1
  fi
}

#######################################
# Parse the command-line options into the global settings.
# Globals:   SCRIPT, BASE_DIR, OUTPUT_SUBDIR, API_URL, TIMEOUT, FILE_LIST,
#            PDF_LIST (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 1 on an unknown option or a missing value;
#            -h/--help calls usage
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -s|--script)
        require_value "$1" "$#"
        SCRIPT="$2"
        shift 2
        ;;
      -d|--base-dir)
        require_value "$1" "$#"
        BASE_DIR="$2"
        shift 2
        ;;
      -o|--output-subdir)
        require_value "$1" "$#"
        OUTPUT_SUBDIR="$2"
        shift 2
        ;;
      -u|--api-url)
        require_value "$1" "$#"
        API_URL="$2"
        shift 2
        ;;
      -t|--timeout)
        require_value "$1" "$#"
        TIMEOUT="$2"
        shift 2
        ;;
      -f|--file-list)
        require_value "$1" "$#"
        FILE_LIST="$2"
        shift 2
        ;;
      -p|--pdf-list)
        require_value "$1" "$#"
        PDF_LIST="$2"
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      *)
        echo -e "${RED}错误: 未知参数 '$1'${NC}"
        echo "使用 -h 或 --help 查看帮助信息"
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
+
|
|
|
# ============================================================================
# Parameter validation
# ============================================================================
banner_rule="${BLUE}==============================================================================${NC}"
echo -e "${banner_rule}"
echo -e "${BLUE}PDF 批量处理任务启动${NC}"
echo -e "${banner_rule}"
echo ""

# The Python client script must be an existing regular file.
[[ -f "${SCRIPT}" ]] || {
  echo -e "${RED}错误: Python 脚本不存在: ${SCRIPT}${NC}"
  exit 1
}
echo -e "${GREEN}✓ Python 脚本:${NC} ${SCRIPT}"

# The base directory must exist.
[[ -d "${BASE_DIR}" ]] || {
  echo -e "${RED}错误: 基础目录不存在: ${BASE_DIR}${NC}"
  exit 1
}

# Echo the effective settings back to the user.
echo -e "${GREEN}✓ 基础目录:${NC} ${BASE_DIR}"
echo -e "${GREEN}✓ 输出子目录:${NC} ${OUTPUT_SUBDIR}"
echo -e "${GREEN}✓ API 地址:${NC} ${API_URL}"
echo -e "${GREEN}✓ 超时时间:${NC} ${TIMEOUT} 秒"
echo ""
|
|
|
+
|
|
|
# ============================================================================
# Build the list of PDF files to process
# ============================================================================
declare -a PDF_FILES=()

#######################################
# Strip leading and trailing whitespace (including a trailing carriage
# return) from a string using parameter expansion only. Unlike piping through
# xargs, this leaves quotes, backslashes and inner runs of spaces untouched.
# Arguments: $1 - text to trim
# Outputs:   trimmed text on stdout (no trailing newline)
#######################################
trim_whitespace() {
  local text="$1"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "${text}"
}

if [[ -n "${FILE_LIST}" ]]; then
  # Source 1: a list file with one path per line.
  if [[ ! -f "${FILE_LIST}" ]]; then
    echo -e "${RED}错误: 文件列表不存在: ${FILE_LIST}${NC}"
    exit 1
  fi

  echo -e "${YELLOW}从文件列表读取: ${FILE_LIST}${NC}"
  # `|| [[ -n ... ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "${line}" ]]; do
    # Trim first so indented comments and whitespace-only lines are skipped.
    line=$(trim_whitespace "${line}")
    [[ -z "${line}" || "${line}" == \#* ]] && continue

    # Relative paths are resolved against the base directory.
    [[ "${line}" == /* ]] || line="${BASE_DIR}/${line}"

    PDF_FILES+=("${line}")
  done < "${FILE_LIST}"

elif [[ -n "${PDF_LIST}" ]]; then
  # Source 2: a comma-separated list given on the command line.
  echo -e "${YELLOW}从参数列表读取 PDF 文件${NC}"
  IFS=',' read -ra PDFS <<< "${PDF_LIST}"
  for pdf in "${PDFS[@]}"; do
    pdf=$(trim_whitespace "${pdf}")
    # Ignore empty entries such as the one produced by a trailing comma.
    [[ -z "${pdf}" ]] && continue

    if [[ "${pdf}" != /* ]]; then
      if [[ "${pdf}" == */* ]]; then
        # Relative path that already names a sub-directory.
        pdf="${BASE_DIR}/${pdf}"
      else
        # Bare file name: look in the sub-directory named after the file
        # (its name without the .pdf suffix).
        pdf_name="${pdf%.pdf}"
        pdf="${BASE_DIR}/${pdf_name}/${pdf}"
      fi
    fi

    PDF_FILES+=("${pdf}")
  done

else
  # Source 3: every *.pdf below the base directory, in sorted order.
  echo -e "${YELLOW}在基础目录中查找所有 PDF 文件${NC}"

  # NUL-delimited so file names with spaces or newlines survive intact.
  while IFS= read -r -d '' file; do
    PDF_FILES+=("${file}")
  done < <(find "${BASE_DIR}" -type f -name "*.pdf" -print0 | sort -z)
fi

# Nothing to do without at least one candidate file.
if [[ ${#PDF_FILES[@]} -eq 0 ]]; then
  echo -e "${RED}错误: 未找到任何 PDF 文件${NC}"
  exit 1
fi

echo -e "${GREEN}找到 ${#PDF_FILES[@]} 个 PDF 文件:${NC}"
for i in "${!PDF_FILES[@]}"; do
  echo -e "  ${BLUE}[$((i + 1))]${NC} ${PDF_FILES[$i]}"
done
echo ""
|
|
|
+
|
|
|
# ============================================================================
# Confirmation prompt
# ============================================================================
# Render the coloured prompt up front with every expansion quoted, so the
# colour codes are never subject to word splitting or globbing.
confirm_prompt=$(echo -e "${YELLOW}是否继续处理? [Y/n]: ${NC}")

# `read` returns non-zero at end of input (e.g. stdin is not a terminal);
# tolerate that so `set -e` does not abort, and fall back to the default.
confirm=""
read -r -p "${confirm_prompt}" confirm || true
confirm=${confirm:-Y}  # pressing Enter accepts the advertised default "Y"

if [[ ! "${confirm}" =~ ^[Yy]$ ]]; then
  echo -e "${YELLOW}已取消处理${NC}"
  exit 0
fi
echo ""
|
|
|
+
|
|
|
# ============================================================================
# Process the PDF files
# ============================================================================
success_count=0
failed_count=0
skipped_count=0
declare -a failed_files=()

#######################################
# Run the Python client once for every entry of PDF_FILES and keep score.
# Globals:   PDF_FILES, SCRIPT, OUTPUT_SUBDIR, API_URL, TIMEOUT (read);
#            success_count, failed_count, skipped_count, failed_files,
#            total, start_time, end_time, total_duration (written)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0; per-file failures are recorded in the counters instead
#######################################
process_pdf_files() {
  local idx pdf_file current pdf_dir output_dir file_start_time file_duration
  local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Prefer `python`; fall back to `python3` on systems that ship only that.
  local interpreter="python"
  command -v python >/dev/null 2>&1 || interpreter="python3"

  echo -e "${BLUE}==============================================================================${NC}"
  echo -e "${BLUE}开始批量处理${NC}"
  echo -e "${BLUE}==============================================================================${NC}"
  echo ""

  # Set before the loop so the summary can rely on it unconditionally.
  total=${#PDF_FILES[@]}
  start_time=$(date +%s)

  for idx in "${!PDF_FILES[@]}"; do
    pdf_file="${PDF_FILES[$idx]}"
    current=$((idx + 1))

    echo -e "${BLUE}${rule}${NC}"
    echo -e "${BLUE}[${current}/${total}] 处理: ${pdf_file}${NC}"
    echo -e "${BLUE}${rule}${NC}"

    # Missing inputs are skipped rather than treated as failures.
    # Counters use x=$((x + 1)): `((x++))` returns status 1 while x is 0,
    # which would terminate the whole script under `set -e`.
    if [[ ! -f "${pdf_file}" ]]; then
      echo -e "${RED}⚠ 跳过: 文件不存在${NC}"
      skipped_count=$((skipped_count + 1))
      echo ""
      continue
    fi

    # Results are written to a sub-directory next to the PDF itself.
    pdf_dir=$(dirname "${pdf_file}")
    output_dir="${pdf_dir}/${OUTPUT_SUBDIR}"

    echo -e "${GREEN}📄 输入文件:${NC} ${pdf_file}"
    echo -e "${GREEN}📁 输出目录:${NC} ${output_dir}"
    echo -e "${GREEN}🌐 API 地址:${NC} ${API_URL}"
    echo ""

    # Directory creation is disabled here; presumably the Python client
    # creates the output directory itself (TODO confirm).
    # mkdir -p "$output_dir"

    echo -e "${YELLOW}开始处理...${NC}"
    file_start_time=$(date +%s)

    # Running the client as an `if` condition keeps `set -e` from aborting
    # the batch when a single file fails.
    if "${interpreter}" "${SCRIPT}" \
      --input_file "${pdf_file}" \
      --output_dir "${output_dir}" \
      --api_url "${API_URL}" \
      --timeout "${TIMEOUT}"; then
      file_duration=$(( $(date +%s) - file_start_time ))
      echo -e "${GREEN}✓ 成功${NC} (耗时: ${file_duration}秒)"
      success_count=$((success_count + 1))
    else
      file_duration=$(( $(date +%s) - file_start_time ))
      echo -e "${RED}✗ 失败${NC} (耗时: ${file_duration}秒)"
      failed_count=$((failed_count + 1))
      failed_files+=("${pdf_file}")
    fi

    echo ""
  done

  end_time=$(date +%s)
  total_duration=$((end_time - start_time))
}

process_pdf_files
|
|
|
+
|
|
|
# ============================================================================
# Summary statistics and log file
# ============================================================================

#######################################
# Format a duration given in seconds as HH:MM:SS. Pure arithmetic, so it
# works with any `date` implementation and does not wrap after 24 hours.
# Arguments: $1 - duration in seconds (empty counts as 0)
# Outputs:   formatted duration on stdout (no trailing newline)
#######################################
format_duration() {
  local secs=$(( ${1:-0} ))
  printf '%02d:%02d:%02d' \
    $((secs / 3600)) $((secs % 3600 / 60)) $((secs % 60))
}

#######################################
# Render epoch seconds as a local timestamp. BSD/macOS date takes the epoch
# via -r, GNU date via -d @N; if both fail the raw value is printed.
# Arguments: $1 - epoch seconds
# Outputs:   timestamp on stdout
#######################################
format_epoch() {
  date -r "$1" 2>/dev/null \
    || date -d "@$1" 2>/dev/null \
    || printf '%s\n' "$1"
}

echo -e "${BLUE}==============================================================================${NC}"
echo -e "${BLUE}处理完成${NC}"
echo -e "${BLUE}==============================================================================${NC}"
echo ""
echo -e "${GREEN}📊 统计信息:${NC}"
echo -e "  总文件数: ${total}"
echo -e "  ${GREEN}✓ 成功: ${success_count}${NC}"
echo -e "  ${RED}✗ 失败: ${failed_count}${NC}"
echo -e "  ${YELLOW}⊘ 跳过: ${skipped_count}${NC}"
echo -e "  ⏱️  总耗时: ${total_duration} 秒 ($(format_duration "${total_duration}"))"
echo ""

# List the files that failed, if any.
if [[ ${failed_count} -gt 0 ]]; then
  echo -e "${RED}失败的文件:${NC}"
  for file in "${failed_files[@]}"; do
    echo -e "  ${RED}✗${NC} $file"
  done
  echo ""
fi

# Write a plain-text (uncoloured) log into the current working directory.
log_file="batch_process_$(date +%Y%m%d_%H%M%S).log"
{
  echo "批量处理日志"
  echo "============================================"
  echo "开始时间: $(format_epoch "${start_time}")"
  echo "结束时间: $(format_epoch "${end_time}")"
  echo "总耗时: ${total_duration} 秒"
  echo ""
  echo "统计信息:"
  echo "  总文件数: ${total}"
  echo "  成功: ${success_count}"
  echo "  失败: ${failed_count}"
  echo "  跳过: ${skipped_count}"
  echo ""
  if [[ ${failed_count} -gt 0 ]]; then
    echo "失败的文件:"
    for file in "${failed_files[@]}"; do
      echo "  - $file"
    done
  fi
} > "$log_file"

echo -e "${GREEN}📝 日志已保存: ${log_file}${NC}"
echo ""
|
|
|
+
|
|
|
# Exit status: 1 when at least one file failed, otherwise 0.
if (( failed_count > 0 )); then
  exit 1
fi
exit 0
|