# ppstructurev3_single_client.py (15 KB)
  1. """PDF转图像后通过API统一处理"""
  2. import json
  3. import time
  4. import os
  5. import traceback
  6. import argparse
  7. import sys
  8. import warnings
  9. import base64
  10. from pathlib import Path
  11. from typing import List, Dict, Any, Union
  12. import requests
  13. from tqdm import tqdm
  14. from dotenv import load_dotenv
  15. load_dotenv(override=True)
  16. from utils import (
  17. collect_pid_files,
  18. get_input_files,
  19. )
  20. from ppstructurev3_utils import (
  21. convert_pruned_result_to_json,
  22. save_output_images,
  23. save_markdown_content
  24. )
  25. def call_api_for_image(image_path: str, api_url: str, timeout: int = 300) -> Dict[str, Any]:
  26. """
  27. 为单个图像调用API
  28. Args:
  29. image_path: 图像文件路径
  30. api_url: API URL
  31. timeout: 超时时间(秒)
  32. Returns:
  33. API返回结果
  34. """
  35. try:
  36. # 对本地图像进行Base64编码
  37. with open(image_path, "rb") as file:
  38. image_bytes = file.read()
  39. image_data = base64.b64encode(image_bytes).decode("ascii")
  40. payload = {
  41. "file": image_data,
  42. "fileType": 1,
  43. # 添加管道参数设置
  44. "useDocOrientationClassify": False, # 流水分析场景关闭方向分类
  45. "useDocUnwarping": False,
  46. "useSealRecognition": True,
  47. "useTableRecognition": True,
  48. "useFormulaRecognition": False, # 避免公式识别的索引错误
  49. "useChartRecognition": True,
  50. "useRegionDetection": False,
  51. "useOcrResultsWithTableCells": True,
  52. "useTableOrientationClassify": False,
  53. "useWiredTableCellsTransToHtml": True,
  54. "useWirelessTableCellsTransToHtml": True,
  55. }
  56. # 调用API
  57. response = requests.post(api_url, json=payload, timeout=timeout)
  58. response.raise_for_status()
  59. return response.json()["result"]
  60. except requests.exceptions.Timeout:
  61. raise Exception(f"API调用超时 ({timeout}秒)")
  62. except requests.exceptions.RequestException as e:
  63. raise Exception(f"API调用失败: {e}")
  64. except KeyError:
  65. raise Exception("API返回格式错误,缺少'result'字段")
  66. except Exception as e:
  67. raise Exception(f"处理图像时发生错误: {e}")
  68. def process_images_via_api(image_paths: List[str],
  69. api_url: str,
  70. output_dir: str = "./output",
  71. normalize_numbers: bool = True,
  72. timeout: int = 300) -> List[Dict[str, Any]]:
  73. """
  74. 通过API统一处理图像文件
  75. Args:
  76. image_paths: 图像路径列表
  77. api_url: API URL
  78. output_dir: 输出目录
  79. normalize_numbers: 是否标准化数字格式
  80. timeout: API调用超时时间
  81. Returns:
  82. 处理结果列表
  83. """
  84. # 创建输出目录
  85. output_path = Path(output_dir)
  86. output_path.mkdir(parents=True, exist_ok=True)
  87. all_results = []
  88. total_images = len(image_paths)
  89. print(f"Processing {total_images} images via API")
  90. # 使用tqdm显示进度
  91. with tqdm(total=total_images, desc="Processing images", unit="img",
  92. bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
  93. # 逐个处理图像
  94. for img_path in image_paths:
  95. start_time = time.time()
  96. try:
  97. # 调用API处理图像
  98. api_result = call_api_for_image(img_path, api_url, timeout)
  99. processing_time = time.time() - start_time
  100. # 获取主要数据
  101. layout_parsing_results = api_result.get('layoutParsingResults', [])
  102. if not layout_parsing_results:
  103. print("⚠️ Warning: No layoutParsingResults found in API response")
  104. return []
  105. # 处理API返回结果
  106. input_path = Path(img_path)
  107. # 生成输出文件名
  108. output_filename = input_path.stem
  109. # 处理结果
  110. for idx, result in enumerate(layout_parsing_results):
  111. if idx > 0:
  112. raise ValueError("Multiple results found for a single image")
  113. json_content = result.get('prunedResult', {})
  114. json_output_path, converted_json = convert_pruned_result_to_json(
  115. json_content,
  116. str(input_path),
  117. output_dir,
  118. output_filename,
  119. normalize_numbers=normalize_numbers
  120. )
  121. # 保存输出图像
  122. img_content = result.get('outputImages', {})
  123. saved_images = save_output_images(img_content, str(output_dir), output_filename)
  124. # 保存Markdown内容
  125. markdown_content = result.get('markdown', {})
  126. md_output_path = save_markdown_content(
  127. markdown_content,
  128. output_dir,
  129. output_filename,
  130. normalize_numbers=normalize_numbers,
  131. key_text='markdown_texts',
  132. key_images='markdown_images',
  133. json_data=converted_json # 🎯 新增参数
  134. )
  135. # 记录处理结果
  136. all_results.append({
  137. "image_path": str(input_path),
  138. "processing_time": processing_time,
  139. "success": True,
  140. "api_url": api_url,
  141. "output_json": json_output_path,
  142. "output_md": md_output_path,
  143. "is_pdf_page": "_page_" in input_path.name, # 标记是否为PDF页面
  144. "processing_info": converted_json.get('processing_info', {})
  145. })
  146. # 更新进度条
  147. success_count = sum(1 for r in all_results if r.get('success', False))
  148. pbar.update(1)
  149. pbar.set_postfix({
  150. 'time': f"{processing_time:.2f}s",
  151. 'success': f"{success_count}/{len(all_results)}",
  152. 'rate': f"{success_count/len(all_results)*100:.1f}%"
  153. })
  154. except Exception as e:
  155. print(f"Error processing {Path(img_path).name}: {e}", file=sys.stderr)
  156. import traceback
  157. traceback.print_exc()
  158. # 添加错误结果
  159. all_results.append({
  160. "image_path": str(img_path),
  161. "processing_time": 0,
  162. "success": False,
  163. "api_url": api_url,
  164. "error": str(e),
  165. "is_pdf_page": "_page_" in Path(img_path).name
  166. })
  167. pbar.update(1)
  168. return all_results
  169. def main():
  170. """主函数"""
  171. parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 API Client - Unified PDF/Image Processor")
  172. # 参数定义
  173. input_group = parser.add_mutually_exclusive_group(required=True)
  174. input_group.add_argument("--input_file", type=str, help="Input file (supports both PDF and image file)")
  175. input_group.add_argument("--input_dir", type=str, help="Input directory (supports both PDF and image files)")
  176. input_group.add_argument("--input_file_list", type=str, help="Input file list (one file per line)")
  177. input_group.add_argument("--input_csv", type=str, help="Input CSV file with image_path and status columns")
  178. parser.add_argument("--output_dir", type=str, required=True, help="Output directory")
  179. parser.add_argument("--api_url", type=str, default="http://localhost:8080/layout-parsing", help="API URL")
  180. parser.add_argument("--pdf_dpi", type=int, default=200, help="DPI for PDF to image conversion")
  181. parser.add_argument("--timeout", type=int, default=300, help="API timeout in seconds")
  182. parser.add_argument("--no-normalize", action="store_true", help="禁用数字标准化")
  183. parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 files)")
  184. parser.add_argument("--collect_results", type=str, help="收集处理结果到指定CSV文件")
  185. args = parser.parse_args()
  186. normalize_numbers = not args.no_normalize
  187. try:
  188. # 获取并预处理输入文件
  189. print("🔄 Preprocessing input files...")
  190. input_files = get_input_files(args)
  191. if not input_files:
  192. print("❌ No input files found or processed")
  193. return 1
  194. if args.test_mode:
  195. input_files = input_files[:20]
  196. print(f"Test mode: processing only {len(input_files)} images")
  197. print(f"🌐 Using API: {args.api_url}")
  198. print(f"🔧 数字标准化: {'启用' if normalize_numbers else '禁用'}")
  199. print(f"⏱️ Timeout: {args.timeout} seconds")
  200. # 开始处理
  201. start_time = time.time()
  202. results = process_images_via_api(
  203. input_files,
  204. args.api_url,
  205. args.output_dir,
  206. normalize_numbers=normalize_numbers,
  207. timeout=args.timeout
  208. )
  209. total_time = time.time() - start_time
  210. # 统计结果
  211. success_count = sum(1 for r in results if r.get('success', False))
  212. error_count = len(results) - success_count
  213. pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))
  214. total_changes = sum(r.get('processing_info', {}).get('character_changes_count', 0) for r in results if 'processing_info' in r)
  215. print(f"\n" + "="*60)
  216. print(f"✅ API Processing completed!")
  217. print(f"📊 Statistics:")
  218. print(f" Total files processed: {len(input_files)}")
  219. print(f" PDF pages processed: {pdf_page_count}")
  220. print(f" Regular images processed: {len(input_files) - pdf_page_count}")
  221. print(f" Successful: {success_count}")
  222. print(f" Failed: {error_count}")
  223. if len(input_files) > 0:
  224. print(f" Success rate: {success_count / len(input_files) * 100:.2f}%")
  225. if normalize_numbers:
  226. print(f" 总标准化字符数: {total_changes}")
  227. print(f"⏱️ Performance:")
  228. print(f" Total time: {total_time:.2f} seconds")
  229. if total_time > 0:
  230. print(f" Throughput: {len(input_files) / total_time:.2f} files/second")
  231. print(f" Avg time per file: {total_time / len(input_files):.2f} seconds")
  232. # 保存结果统计
  233. stats = {
  234. "total_files": len(input_files),
  235. "pdf_pages": pdf_page_count,
  236. "regular_images": len(input_files) - pdf_page_count,
  237. "success_count": success_count,
  238. "error_count": error_count,
  239. "success_rate": success_count / len(input_files) if len(input_files) > 0 else 0,
  240. "total_time": total_time,
  241. "throughput": len(input_files) / total_time if total_time > 0 else 0,
  242. "avg_time_per_file": total_time / len(input_files) if len(input_files) > 0 else 0,
  243. "api_url": args.api_url,
  244. "pdf_dpi": args.pdf_dpi,
  245. "normalize_numbers": normalize_numbers,
  246. "total_character_changes": total_changes,
  247. "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
  248. }
  249. # 保存最终结果
  250. output_file_name = Path(args.output_dir).name
  251. output_file = os.path.join(args.output_dir, f"{output_file_name}_api_results.json")
  252. final_results = {
  253. "stats": stats,
  254. "results": results
  255. }
  256. with open(output_file, 'w', encoding='utf-8') as f:
  257. json.dump(final_results, f, ensure_ascii=False, indent=2)
  258. print(f"💾 Results saved to: {output_file}")
  259. # 如果没有收集结果的路径,使用缺省文件名,和output_dir同一路径
  260. if not args.collect_results:
  261. output_file_processed = Path(args.output_dir) / f"processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv"
  262. else:
  263. output_file_processed = Path(args.collect_results).resolve()
  264. processed_files = collect_pid_files(output_file)
  265. with open(output_file_processed, 'w', encoding='utf-8') as f:
  266. f.write("image_path,status\n")
  267. for file_path, status in processed_files:
  268. f.write(f"{file_path},{status}\n")
  269. print(f"💾 Processed files saved to: {output_file_processed}")
  270. return 0
  271. except Exception as e:
  272. print(f"❌ Processing failed: {e}", file=sys.stderr)
  273. traceback.print_exc()
  274. return 1
  275. if __name__ == "__main__":
  276. print(f"🚀 启动PP-StructureV3 API客户端...")
  277. if len(sys.argv) == 1:
  278. # 如果没有命令行参数,使用默认配置运行
  279. print("ℹ️ No command line arguments provided. Running with default configuration...")
  280. # 默认配置
  281. default_config = {
  282. # "input_file": "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司/2023年度报告母公司_page_027.png",
  283. # "input_file": "/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/PPStructureV3_Results/2023年度报告母公司/2023年度报告母公司_page_027.png",
  284. "input_file": "/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.pdf",
  285. "output_dir": "/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/PPStructureV3_Results",
  286. "collect_results": f"/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/PPStructureV3_Results/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
  287. # "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
  288. # "output_dir": "./OmniDocBench_API_Results",
  289. # "collect_results": f"./OmniDocBench_API_Results/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
  290. "api_url": "http://10.192.72.11:8111/layout-parsing",
  291. "timeout": "300",
  292. }
  293. # 构造参数
  294. sys.argv = [sys.argv[0]]
  295. for key, value in default_config.items():
  296. sys.argv.extend([f"--{key}", str(value)])
  297. # sys.argv.append("--no-normalize")
  298. # 测试模式
  299. # sys.argv.append("--test_mode")
  300. sys.exit(main())