api_client.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. #!/usr/bin/env python3
  2. """
  3. 批量处理图片/PDF文件并通过 API 调用远程服务(PP-StructureV3 API 客户端版本)
  4. 通过 HTTP API 调用远程 PP-StructureV3 服务进行文档处理。
  5. 适用于远程服务、分布式处理场景。
  6. 使用方法:
  7. python api_client.py --input document.pdf --output_dir ./output --api_url http://10.192.72.11:20026/layout-parsing
  8. python api_client.py --input ./images/ --output_dir ./output --api_url http://10.192.72.11:20026/layout-parsing
  9. python api_client.py --input file_list.txt --output_dir ./output --api_url http://10.192.72.11:20026/layout-parsing
  10. """
  11. import os
  12. import sys
  13. import json
  14. import time
  15. import traceback
  16. import base64
  17. from pathlib import Path
  18. from typing import List, Dict, Any
  19. from tqdm import tqdm
  20. import argparse
  21. import requests
  22. from loguru import logger
  23. # 导入 ocr_utils
  24. ocr_platform_root = Path(__file__).parents[2]
  25. if str(ocr_platform_root) not in sys.path:
  26. sys.path.insert(0, str(ocr_platform_root))
  27. from ocr_utils import (
  28. get_input_files,
  29. collect_pid_files,
  30. setup_logging
  31. )
  32. # 导入共享工具函数
  33. tools_root = Path(__file__).parents[1]
  34. if str(tools_root) not in sys.path:
  35. sys.path.insert(0, str(tools_root))
  36. from paddle_common.utils import (
  37. convert_pruned_result_to_json,
  38. save_output_images,
  39. save_markdown_content
  40. )
  41. def call_api_for_image(image_path: str, api_url: str, timeout: int = 300) -> Dict[str, Any]:
  42. """
  43. 为单个图像调用API
  44. Args:
  45. image_path: 图像文件路径
  46. api_url: API URL
  47. timeout: 超时时间(秒)
  48. Returns:
  49. API返回结果
  50. """
  51. try:
  52. # 对本地图像进行Base64编码
  53. with open(image_path, "rb") as file:
  54. image_bytes = file.read()
  55. image_data = base64.b64encode(image_bytes).decode("ascii")
  56. payload = {
  57. "file": image_data,
  58. "fileType": 1,
  59. # 添加管道参数设置
  60. "useDocOrientationClassify": True,
  61. "useDocUnwarping": False,
  62. "useSealRecognition": True,
  63. "useTableRecognition": True,
  64. "useFormulaRecognition": False, # 避免公式识别的索引错误
  65. "useChartRecognition": True,
  66. "useRegionDetection": False,
  67. "useOcrResultsWithTableCells": True,
  68. "useTableOrientationClassify": False,
  69. "useWiredTableCellsTransToHtml": True,
  70. "useWirelessTableCellsTransToHtml": True,
  71. }
  72. # 调用API
  73. response = requests.post(api_url, json=payload, timeout=timeout)
  74. response.raise_for_status()
  75. return response.json()["result"]
  76. except requests.exceptions.Timeout:
  77. raise Exception(f"API调用超时 ({timeout}秒)")
  78. except requests.exceptions.RequestException as e:
  79. raise Exception(f"API调用失败: {e}")
  80. except KeyError:
  81. raise Exception("API返回格式错误,缺少'result'字段")
  82. except Exception as e:
  83. raise Exception(f"处理图像时发生错误: {e}")
  84. def process_images_via_api(image_paths: List[str],
  85. api_url: str,
  86. output_dir: str = "./output",
  87. normalize_numbers: bool = True,
  88. timeout: int = 300,
  89. log_level: str = "INFO") -> List[Dict[str, Any]]:
  90. """
  91. 通过API统一处理图像文件
  92. Args:
  93. image_paths: 图像路径列表
  94. api_url: API URL
  95. output_dir: 输出目录
  96. normalize_numbers: 是否标准化数字格式
  97. timeout: API调用超时时间
  98. log_level: 日志级别
  99. Returns:
  100. 处理结果列表
  101. """
  102. # 创建输出目录
  103. output_path = Path(output_dir)
  104. output_path.mkdir(parents=True, exist_ok=True)
  105. all_results = []
  106. total_images = len(image_paths)
  107. logger.info(f"Processing {total_images} images via API")
  108. # 使用tqdm显示进度
  109. with tqdm(total=total_images, desc="Processing images", unit="img",
  110. bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
  111. # 逐个处理图像
  112. for img_path in image_paths:
  113. start_time = time.time()
  114. try:
  115. # 调用API处理图像
  116. api_result = call_api_for_image(img_path, api_url, timeout)
  117. processing_time = time.time() - start_time
  118. # 获取主要数据
  119. layout_parsing_results = api_result.get('layoutParsingResults', [])
  120. if not layout_parsing_results:
  121. logger.warning("⚠️ Warning: No layoutParsingResults found in API response")
  122. all_results.append({
  123. "image_path": str(img_path),
  124. "processing_time": processing_time,
  125. "success": False,
  126. "api_url": api_url,
  127. "error": "No layoutParsingResults found in API response",
  128. "is_pdf_page": "_page_" in Path(img_path).name
  129. })
  130. pbar.update(1)
  131. continue
  132. # 处理API返回结果
  133. input_path = Path(img_path)
  134. # 生成输出文件名
  135. output_filename = input_path.stem
  136. # 处理结果(应该只有一个结果)
  137. for idx, result in enumerate(layout_parsing_results):
  138. if idx > 0:
  139. raise ValueError("Multiple results found for a single image")
  140. json_content = result.get('prunedResult', {})
  141. json_output_path, converted_json = convert_pruned_result_to_json(
  142. json_content,
  143. str(input_path),
  144. output_dir,
  145. output_filename,
  146. normalize_numbers=normalize_numbers
  147. )
  148. # 保存输出图像
  149. img_content = result.get('outputImages', {})
  150. saved_images = save_output_images(img_content, str(output_dir), output_filename)
  151. # 保存Markdown内容
  152. markdown_content = result.get('markdown', {})
  153. md_output_path = save_markdown_content(
  154. markdown_content,
  155. output_dir,
  156. output_filename,
  157. normalize_numbers=normalize_numbers,
  158. key_text='markdown_texts',
  159. key_images='markdown_images',
  160. json_data=converted_json
  161. )
  162. # 根据实际保存的文件路径判断成功(成功判断标准:.md 和 .json 文件都存在)
  163. actual_md_path = Path(md_output_path) if md_output_path else Path(output_dir) / f"{output_filename}.md"
  164. actual_json_path = Path(json_output_path) if json_output_path else Path(output_dir) / f"{output_filename}.json"
  165. success = actual_md_path.exists() and actual_json_path.exists()
  166. # 记录处理结果
  167. result_info = {
  168. "image_path": str(input_path),
  169. "processing_time": processing_time,
  170. "success": success,
  171. "api_url": api_url,
  172. "is_pdf_page": "_page_" in input_path.name,
  173. "processing_info": converted_json.get('processing_info', {})
  174. }
  175. if success:
  176. result_info.update({
  177. "output_json": json_output_path,
  178. "output_md": md_output_path,
  179. "output_files": {
  180. "md": str(actual_md_path),
  181. "json": str(actual_json_path),
  182. **saved_images
  183. }
  184. })
  185. logger.info(f"✅ 处理成功: {input_path.stem}")
  186. else:
  187. missing_files = []
  188. if not actual_md_path.exists():
  189. missing_files.append("md")
  190. if not actual_json_path.exists():
  191. missing_files.append("json")
  192. result_info["error"] = f"输出文件不存在: {', '.join(missing_files)}"
  193. result_info["success"] = False
  194. logger.error(f"❌ 处理失败: {input_path.stem} - {result_info['error']}")
  195. all_results.append(result_info)
  196. # 更新进度条
  197. success_count = sum(1 for r in all_results if r.get('success', False))
  198. pbar.update(1)
  199. pbar.set_postfix({
  200. 'time': f"{processing_time:.2f}s",
  201. 'success': f"{success_count}/{len(all_results)}",
  202. 'rate': f"{success_count/len(all_results)*100:.1f}%" if len(all_results) > 0 else "0%"
  203. })
  204. except Exception as e:
  205. logger.error(f"Error processing {Path(img_path).name}: {e}")
  206. if log_level == "DEBUG":
  207. traceback.print_exc()
  208. # 添加错误结果
  209. all_results.append({
  210. "image_path": str(img_path),
  211. "processing_time": 0,
  212. "success": False,
  213. "api_url": api_url,
  214. "error": str(e),
  215. "is_pdf_page": "_page_" in Path(img_path).name
  216. })
  217. pbar.update(1)
  218. return all_results
  219. def main():
  220. """主函数"""
  221. parser = argparse.ArgumentParser(
  222. description="PP-StructureV3 API Client Batch Processing",
  223. formatter_class=argparse.RawDescriptionHelpFormatter,
  224. epilog="""
  225. 示例:
  226. # 处理单个PDF文件
  227. python api_client.py --input document.pdf --output_dir ./output --api_url http://localhost:8080/layout-parsing
  228. # 处理图片目录
  229. python api_client.py --input ./images/ --output_dir ./output --api_url http://10.192.72.11:8111/layout-parsing
  230. # 处理文件列表
  231. python api_client.py --input file_list.txt --output_dir ./output --api_url http://localhost:8080/layout-parsing
  232. # 指定页面范围(PDF或图片目录)
  233. python api_client.py --input document.pdf --output_dir ./output --pages "1-5,7" --api_url http://localhost:20026/layout-parsing
  234. # 仅验证配置(dry run)
  235. python api_client.py --input document.pdf --output_dir ./output --api_url http://localhost:20026/layout-parsing --dry_run
  236. # 使用 DEBUG 日志级别获取详细错误信息
  237. python api_client.py --input document.pdf --output_dir ./output --api_url http://localhost:20026/layout-parsing --log_level DEBUG
  238. """
  239. )
  240. # 输入参数(统一使用 --input)
  241. parser.add_argument(
  242. "--input", "-i",
  243. required=True,
  244. type=str,
  245. help="输入路径(支持PDF文件、图片文件、图片目录、文件列表.txt、CSV文件)"
  246. )
  247. # 输出参数
  248. parser.add_argument(
  249. "--output_dir", "-o",
  250. type=str,
  251. required=True,
  252. help="输出目录"
  253. )
  254. # API 参数
  255. parser.add_argument(
  256. "--api_url",
  257. type=str,
  258. default="http://localhost:8080/layout-parsing",
  259. help="API URL(默认: http://localhost:8080/layout-parsing)"
  260. )
  261. parser.add_argument(
  262. "--timeout",
  263. type=int,
  264. default=300,
  265. help="API 调用超时时间(秒,默认: 300)"
  266. )
  267. parser.add_argument(
  268. "--pdf_dpi",
  269. type=int,
  270. default=200,
  271. help="PDF 转图片的 DPI(默认: 200)"
  272. )
  273. parser.add_argument(
  274. '--no-normalize',
  275. action='store_true',
  276. help='禁用数字标准化'
  277. )
  278. # 处理参数
  279. parser.add_argument(
  280. "--pages", "-p",
  281. type=str,
  282. help="页面范围(PDF和图片目录有效),如: '1-5,7,9-12', '1-', '-10'"
  283. )
  284. parser.add_argument(
  285. "--collect_results",
  286. type=str,
  287. help="收集处理结果到指定CSV文件"
  288. )
  289. # 日志参数
  290. parser.add_argument(
  291. "--log_level",
  292. default="INFO",
  293. choices=["DEBUG", "INFO", "WARNING", "ERROR"],
  294. help="日志级别(默认: INFO)"
  295. )
  296. parser.add_argument(
  297. "--log_file",
  298. type=str,
  299. help="日志文件路径"
  300. )
  301. # Dry run 参数
  302. parser.add_argument(
  303. "--dry_run",
  304. action="store_true",
  305. help="仅验证配置和输入,不执行实际处理"
  306. )
  307. args = parser.parse_args()
  308. # 设置日志
  309. setup_logging(args.log_level, args.log_file)
  310. try:
  311. # 创建参数对象(用于 get_input_files)
  312. class Args:
  313. def __init__(self, input_path, output_dir, pdf_dpi):
  314. self.input = input_path
  315. self.output_dir = output_dir
  316. self.pdf_dpi = pdf_dpi
  317. args_obj = Args(args.input, args.output_dir, args.pdf_dpi)
  318. # 获取并预处理输入文件(页面范围过滤已在 get_input_files 中处理)
  319. logger.info("🔄 Preprocessing input files...")
  320. if args.pages:
  321. logger.info(f"📄 页面范围: {args.pages}")
  322. image_files = get_input_files(args_obj, page_range=args.pages)
  323. if not image_files:
  324. logger.error("❌ No input files found or processed")
  325. return 1
  326. output_dir = Path(args.output_dir).resolve()
  327. logger.info(f"📁 Output dir: {output_dir}")
  328. logger.info(f"📊 Found {len(image_files)} image files to process")
  329. # Dry run 模式
  330. if args.dry_run:
  331. logger.info("🔍 Dry run mode: 仅验证配置,不执行处理")
  332. logger.info(f"📋 配置信息:")
  333. logger.info(f" - 输入: {args.input}")
  334. logger.info(f" - 输出目录: {output_dir}")
  335. logger.info(f" - API URL: {args.api_url}")
  336. logger.info(f" - 超时时间: {args.timeout} 秒")
  337. logger.info(f" - PDF DPI: {args.pdf_dpi}")
  338. logger.info(f" - 数字标准化: {not args.no_normalize}")
  339. logger.info(f" - 日志级别: {args.log_level}")
  340. if args.pages:
  341. logger.info(f" - 页面范围: {args.pages}")
  342. logger.info(f"📋 将要处理的文件 ({len(image_files)} 个):")
  343. for i, img_file in enumerate(image_files[:20], 1): # 只显示前20个
  344. logger.info(f" {i}. {img_file}")
  345. if len(image_files) > 20:
  346. logger.info(f" ... 还有 {len(image_files) - 20} 个文件")
  347. logger.info("✅ Dry run 完成:配置验证通过")
  348. return 0
  349. logger.info(f"🌐 Using API: {args.api_url}")
  350. logger.info(f"⏱️ Timeout: {args.timeout} seconds")
  351. logger.info(f"🔧 数字标准化: {'启用' if not args.no_normalize else '禁用'}")
  352. # 开始处理
  353. start_time = time.time()
  354. results = process_images_via_api(
  355. image_files,
  356. args.api_url,
  357. str(output_dir),
  358. normalize_numbers=not args.no_normalize,
  359. timeout=args.timeout,
  360. log_level=args.log_level
  361. )
  362. total_time = time.time() - start_time
  363. # 统计结果
  364. success_count = sum(1 for r in results if r.get('success', False))
  365. error_count = len(results) - success_count
  366. pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))
  367. # 统计标准化信息
  368. total_changes = sum(r.get('processing_info', {}).get('character_changes_count', 0) for r in results if 'processing_info' in r)
  369. print(f"\n" + "="*60)
  370. print(f"✅ API Processing completed!")
  371. print(f"📊 Statistics:")
  372. print(f" Total files processed: {len(image_files)}")
  373. print(f" PDF pages processed: {pdf_page_count}")
  374. print(f" Regular images processed: {len(image_files) - pdf_page_count}")
  375. print(f" Successful: {success_count}")
  376. print(f" Failed: {error_count}")
  377. if len(image_files) > 0:
  378. print(f" Success rate: {success_count / len(image_files) * 100:.2f}%")
  379. if not args.no_normalize and total_changes > 0:
  380. print(f" 总标准化字符数: {total_changes}")
  381. print(f"⏱️ Performance:")
  382. print(f" Total time: {total_time:.2f} seconds")
  383. if total_time > 0:
  384. print(f" Throughput: {len(image_files) / total_time:.2f} images/second")
  385. print(f" Avg time per image: {total_time / len(image_files):.2f} seconds")
  386. print(f"\n📁 Output Structure:")
  387. print(f" output_dir/")
  388. print(f" ├── filename.md # Markdown content")
  389. print(f" ├── filename.json # Content list JSON")
  390. print(f" └── filename_*.jpg # Output images")
  391. # 保存结果统计
  392. stats = {
  393. "total_files": len(image_files),
  394. "pdf_pages": pdf_page_count,
  395. "regular_images": len(image_files) - pdf_page_count,
  396. "success_count": success_count,
  397. "error_count": error_count,
  398. "success_rate": success_count / len(image_files) if len(image_files) > 0 else 0,
  399. "total_time": total_time,
  400. "throughput": len(image_files) / total_time if total_time > 0 else 0,
  401. "avg_time_per_image": total_time / len(image_files) if len(image_files) > 0 else 0,
  402. "api_url": args.api_url,
  403. "timeout": args.timeout,
  404. "pdf_dpi": args.pdf_dpi,
  405. "normalization_enabled": not args.no_normalize,
  406. "total_character_changes": total_changes,
  407. "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
  408. }
  409. # 保存最终结果
  410. output_file_name = Path(output_dir).name
  411. output_file = output_dir / f"{output_file_name}_api_results.json"
  412. final_results = {
  413. "stats": stats,
  414. "results": results
  415. }
  416. with open(output_file, 'w', encoding='utf-8') as f:
  417. json.dump(final_results, f, ensure_ascii=False, indent=2)
  418. logger.info(f"💾 Results saved to: {output_file}")
  419. # 收集处理结果
  420. if not args.collect_results:
  421. output_file_processed = output_dir / f"processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv"
  422. else:
  423. output_file_processed = Path(args.collect_results).resolve()
  424. processed_files = collect_pid_files(str(output_file))
  425. with open(output_file_processed, 'w', encoding='utf-8') as f:
  426. f.write("image_path,status\n")
  427. for file_path, status in processed_files:
  428. f.write(f"{file_path},{status}\n")
  429. logger.info(f"💾 Processed files saved to: {output_file_processed}")
  430. return 0
  431. except Exception as e:
  432. logger.error(f"Processing failed: {e}")
  433. traceback.print_exc()
  434. return 1
  435. if __name__ == "__main__":
  436. logger.info(f"🚀 启动PP-StructureV3 API客户端...")
  437. logger.info(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
  438. if len(sys.argv) == 1:
  439. # 如果没有命令行参数,使用默认配置运行
  440. logger.info("ℹ️ No command line arguments provided. Running with default configuration...")
  441. # 默认配置(API 客户端)
  442. default_config = {
  443. "input": "/Users/zhch158/workspace/data/流水分析/马公账流水_工商银行.pdf",
  444. "output_dir": "./output",
  445. "api_url": "http://10.192.72.11:20026/layout-parsing", # 默认 API URL
  446. "timeout": "300",
  447. "pdf_dpi": "200",
  448. "pages": "2",
  449. "log_level": "DEBUG",
  450. }
  451. # 构造参数
  452. sys.argv = [sys.argv[0]]
  453. for key, value in default_config.items():
  454. sys.argv.extend([f"--{key}", str(value)])
  455. sys.exit(main())