ppstructurev3_single_client.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. """PDF转图像后通过API统一处理"""
  2. import json
  3. import time
  4. import os
  5. import traceback
  6. import argparse
  7. import sys
  8. import warnings
  9. import base64
  10. from pathlib import Path
  11. from typing import List, Dict, Any, Union
  12. import requests
  13. from tqdm import tqdm
  14. from dotenv import load_dotenv
  15. load_dotenv(override=True)
  16. from utils import (
  17. get_image_files_from_dir,
  18. get_image_files_from_list,
  19. get_image_files_from_csv,
  20. collect_pid_files,
  21. load_images_from_pdf,
  22. normalize_financial_numbers,
  23. normalize_markdown_table
  24. )
  25. def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
  26. """
  27. 将PDF转换为图像文件
  28. Args:
  29. pdf_file: PDF文件路径
  30. output_dir: 输出目录
  31. dpi: 图像分辨率
  32. Returns:
  33. 生成的图像文件路径列表
  34. """
  35. pdf_path = Path(pdf_file)
  36. if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
  37. print(f"❌ Invalid PDF file: {pdf_path}")
  38. return []
  39. # 如果没有指定输出目录,使用PDF同名目录
  40. if output_dir is None:
  41. output_path = pdf_path.parent / f"{pdf_path.stem}"
  42. else:
  43. output_path = Path(output_dir) / f"{pdf_path.stem}"
  44. output_path = output_path.resolve()
  45. output_path.mkdir(parents=True, exist_ok=True)
  46. try:
  47. # 使用doc_utils中的函数加载PDF图像
  48. images = load_images_from_pdf(str(pdf_path), dpi=dpi)
  49. image_paths = []
  50. for i, image in enumerate(images):
  51. # 生成图像文件名
  52. image_filename = f"{pdf_path.stem}_page_{i+1:03d}.png"
  53. image_path = output_path / image_filename
  54. # 保存图像
  55. image.save(str(image_path))
  56. image_paths.append(str(image_path))
  57. print(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
  58. return image_paths
  59. except Exception as e:
  60. print(f"❌ Error converting PDF {pdf_path}: {e}")
  61. traceback.print_exc()
  62. return []
  63. def get_input_files(args) -> List[str]:
  64. """
  65. 获取输入文件列表,统一处理PDF和图像文件
  66. Args:
  67. args: 命令行参数
  68. Returns:
  69. 处理后的图像文件路径列表
  70. """
  71. input_files = []
  72. # 获取原始输入文件
  73. if args.input_csv:
  74. raw_files = get_image_files_from_csv(args.input_csv, "fail")
  75. elif args.input_file_list:
  76. raw_files = get_image_files_from_list(args.input_file_list)
  77. elif args.input_file:
  78. raw_files = [Path(args.input_file).resolve()]
  79. else:
  80. input_dir = Path(args.input_dir).resolve()
  81. if not input_dir.exists():
  82. print(f"❌ Input directory does not exist: {input_dir}")
  83. return []
  84. # 获取所有支持的文件(图像和PDF)
  85. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  86. pdf_extensions = ['.pdf']
  87. raw_files = []
  88. for ext in image_extensions + pdf_extensions:
  89. raw_files.extend(list(input_dir.glob(f"*{ext}")))
  90. raw_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
  91. raw_files = [str(f) for f in raw_files]
  92. # 分别处理PDF和图像文件
  93. pdf_count = 0
  94. image_count = 0
  95. for file_path in raw_files:
  96. file_path = Path(file_path)
  97. if file_path.suffix.lower() == '.pdf':
  98. # 转换PDF为图像
  99. print(f"📄 Processing PDF: {file_path.name}")
  100. pdf_images = convert_pdf_to_images(
  101. str(file_path),
  102. args.output_dir,
  103. dpi=args.pdf_dpi
  104. )
  105. input_files.extend(pdf_images)
  106. pdf_count += 1
  107. else:
  108. # 直接添加图像文件
  109. if file_path.exists():
  110. input_files.append(str(file_path))
  111. image_count += 1
  112. print(f"📊 Input summary:")
  113. print(f" PDF files processed: {pdf_count}")
  114. print(f" Image files found: {image_count}")
  115. print(f" Total image files to process: {len(input_files)}")
  116. return input_files
  117. def convert_api_result_to_json(api_result: Dict[str, Any],
  118. input_image_path: str,
  119. output_dir: str,
  120. filename: str,
  121. normalize_numbers: bool = True) -> tuple[str, Dict[str, Any]]:
  122. """
  123. 将API返回结果转换为标准JSON格式,并支持数字标准化
  124. """
  125. # 获取主要数据
  126. layout_parsing_results = api_result.get('layoutParsingResults', [])
  127. if not layout_parsing_results:
  128. print("⚠️ Warning: No layoutParsingResults found in API response")
  129. return {}
  130. # 取第一个结果(通常只有一个)
  131. main_result = layout_parsing_results[0]
  132. pruned_result = main_result.get('prunedResult', {})
  133. # 构造标准格式的JSON
  134. converted_json = {
  135. "input_path": input_image_path,
  136. "page_index": None,
  137. "model_settings": pruned_result.get('model_settings', {}),
  138. "parsing_res_list": pruned_result.get('parsing_res_list', []),
  139. "doc_preprocessor_res": {
  140. "input_path": None,
  141. "page_index": None,
  142. "model_settings": pruned_result.get('doc_preprocessor_res', {}).get('model_settings', {}),
  143. "angle": pruned_result.get('doc_preprocessor_res', {}).get('angle', 0)
  144. },
  145. "layout_det_res": {
  146. "input_path": None,
  147. "page_index": None,
  148. "boxes": pruned_result.get('layout_det_res', {}).get('boxes', [])
  149. },
  150. "overall_ocr_res": {
  151. "input_path": None,
  152. "page_index": None,
  153. "model_settings": pruned_result.get('overall_ocr_res', {}).get('model_settings', {}),
  154. "dt_polys": pruned_result.get('overall_ocr_res', {}).get('dt_polys', []),
  155. "text_det_params": pruned_result.get('overall_ocr_res', {}).get('text_det_params', {}),
  156. "text_type": pruned_result.get('overall_ocr_res', {}).get('text_type', 'general'),
  157. "textline_orientation_angles": pruned_result.get('overall_ocr_res', {}).get('textline_orientation_angles', []),
  158. "text_rec_score_thresh": pruned_result.get('overall_ocr_res', {}).get('text_rec_score_thresh', 0.0),
  159. "return_word_box": pruned_result.get('overall_ocr_res', {}).get('return_word_box', False),
  160. "rec_texts": pruned_result.get('overall_ocr_res', {}).get('rec_texts', []),
  161. "rec_scores": pruned_result.get('overall_ocr_res', {}).get('rec_scores', []),
  162. "rec_polys": pruned_result.get('overall_ocr_res', {}).get('rec_polys', []),
  163. "rec_boxes": pruned_result.get('overall_ocr_res', {}).get('rec_boxes', [])
  164. },
  165. "table_res_list": pruned_result.get('table_res_list', [])
  166. }
  167. # 数字标准化处理
  168. original_json = converted_json.copy()
  169. changes_count = 0
  170. if normalize_numbers:
  171. # 1. 标准化 parsing_res_list 中的文本内容
  172. for item in converted_json.get('parsing_res_list', []):
  173. if 'block_content' in item:
  174. original_content = item['block_content']
  175. normalized_content = original_content
  176. # 根据block_label类型选择标准化方法
  177. if item.get('block_label') == 'table':
  178. normalized_content = normalize_markdown_table(original_content)
  179. # else:
  180. # normalized_content = normalize_financial_numbers(original_content)
  181. if original_content != normalized_content:
  182. item['block_content'] = normalized_content
  183. changes_count += len([1 for o, n in zip(original_content, normalized_content) if o != n])
  184. # 2. 标准化 table_res_list 中的HTML表格
  185. for table_item in converted_json.get('table_res_list', []):
  186. if 'pred_html' in table_item:
  187. original_html = table_item['pred_html']
  188. normalized_html = normalize_markdown_table(original_html)
  189. if original_html != normalized_html:
  190. table_item['pred_html'] = normalized_html
  191. changes_count += len([1 for o, n in zip(original_html, normalized_html) if o != n])
  192. # 检查是否需要修复表格一致性(这里只做统计,实际修复可能需要更复杂的逻辑)
  193. # 统计表格数量
  194. parsing_res_tables_count = 0
  195. table_res_list_count = 0
  196. if 'parsing_res_list' in converted_json:
  197. parsing_res_tables_count = len([item for item in converted_json['parsing_res_list']
  198. if 'block_label' in item and item['block_label'] == 'table'])
  199. if 'table_res_list' in converted_json:
  200. table_res_list_count = len(converted_json["table_res_list"])
  201. table_consistency_fixed = False
  202. if parsing_res_tables_count != table_res_list_count:
  203. warnings.warn(f"⚠️ Warning: {filename} Table count mismatch - parsing_res_list has {parsing_res_tables_count} tables, "
  204. f"but table_res_list has {table_res_list_count} tables.")
  205. table_consistency_fixed = True
  206. # 这里可以添加实际的修复逻辑,例如根据需要添加或删除表格项
  207. # 但由于缺乏具体规则,暂时只做统计和警告
  208. # 3. 标准化 overall_ocr_res 中的识别文本
  209. # ocr_res = converted_json.get('overall_ocr_res', {})
  210. # if 'rec_texts' in ocr_res:
  211. # original_texts = ocr_res['rec_texts'][:]
  212. # normalized_texts = []
  213. # for text in original_texts:
  214. # normalized_text = normalize_financial_numbers(text)
  215. # normalized_texts.append(normalized_text)
  216. # if text != normalized_text:
  217. # changes_count += len([1 for o, n in zip(text, normalized_text) if o != n])
  218. # ocr_res['rec_texts'] = normalized_texts
  219. # 添加标准化处理信息
  220. converted_json['processing_info'] = {
  221. "normalize_numbers": normalize_numbers,
  222. "changes_applied": changes_count > 0,
  223. "character_changes_count": changes_count,
  224. "parsing_res_tables_count": parsing_res_tables_count,
  225. "table_res_list_count": table_res_list_count,
  226. "table_consistency_fixed": table_consistency_fixed
  227. }
  228. # if changes_count > 0:
  229. # print(f"🔧 已标准化 {changes_count} 个字符(全角→半角)")
  230. else:
  231. converted_json['processing_info'] = {
  232. "normalize_numbers": False,
  233. "changes_applied": False,
  234. "character_changes_count": 0
  235. }
  236. # 保存JSON文件
  237. output_path = Path(output_dir).resolve()
  238. output_path.mkdir(parents=True, exist_ok=True)
  239. json_file_path = output_path / f"{filename}.json"
  240. with open(json_file_path, 'w', encoding='utf-8') as f:
  241. json.dump(converted_json, f, ensure_ascii=False, indent=2)
  242. # 如果启用了标准化且有变化,保存原始版本用于对比
  243. if normalize_numbers and changes_count > 0:
  244. original_output_path = output_path / f"{filename}_original.json"
  245. with open(original_output_path, 'w', encoding='utf-8') as f:
  246. json.dump(original_json, f, ensure_ascii=False, indent=2)
  247. return str(output_path), converted_json
  248. def save_output_images(api_result: Dict[str, Any], output_dir: str, output_filename: str) -> Dict[str, str]:
  249. """
  250. 保存API返回的输出图像
  251. Args:
  252. api_result: API返回的结果
  253. output_dir: 输出目录
  254. Returns:
  255. 保存的图像文件路径字典
  256. """
  257. layout_parsing_results = api_result.get('layoutParsingResults', [])
  258. if not layout_parsing_results:
  259. return {}
  260. main_result = layout_parsing_results[0]
  261. output_images = main_result.get('outputImages', {})
  262. output_path = Path(output_dir).resolve()
  263. output_path.mkdir(parents=True, exist_ok=True)
  264. saved_images = {}
  265. for img_name, img_base64 in output_images.items():
  266. try:
  267. # 解码base64图像
  268. img_data = base64.b64decode(img_base64)
  269. # 生成文件名
  270. img_filename = f"{output_filename}_{img_name}.jpg"
  271. img_path = output_path / img_filename
  272. # 保存图像
  273. with open(img_path, 'wb') as f:
  274. f.write(img_data)
  275. saved_images[img_name] = str(img_path)
  276. # print(f"📷 Saved image: {img_path}")
  277. except Exception as e:
  278. print(f"❌ Error saving image {img_name}: {e}")
  279. return saved_images
  280. def save_markdown_content(api_result: Dict[str, Any], output_dir: str,
  281. filename: str, normalize_numbers: bool = True) -> str:
  282. """
  283. 保存Markdown内容,支持数字标准化
  284. """
  285. layout_parsing_results = api_result.get('layoutParsingResults', [])
  286. if not layout_parsing_results:
  287. return ""
  288. main_result = layout_parsing_results[0]
  289. markdown_data = main_result.get('markdown', {})
  290. output_path = Path(output_dir).resolve()
  291. output_path.mkdir(parents=True, exist_ok=True)
  292. # 保存Markdown文本
  293. markdown_text = markdown_data.get('text', '')
  294. # 数字标准化处理
  295. changes_count = 0
  296. if normalize_numbers and markdown_text:
  297. original_markdown_text = markdown_text
  298. markdown_text = normalize_markdown_table(markdown_text)
  299. changes_count = len([1 for o, n in zip(original_markdown_text, markdown_text) if o != n])
  300. # if changes_count > 0:
  301. # print(f"🔧 Markdown中已标准化 {changes_count} 个字符(全角→半角)")
  302. md_file_path = output_path / f"{filename}.md"
  303. with open(md_file_path, 'w', encoding='utf-8') as f:
  304. f.write(markdown_text)
  305. # 如果启用了标准化且有变化,保存原始版本用于对比
  306. if normalize_numbers and changes_count > 0:
  307. original_output_path = output_path / f"{filename}_original.md"
  308. with open(original_output_path, 'w', encoding='utf-8') as f:
  309. f.write(original_markdown_text)
  310. # 保存Markdown中的图像
  311. markdown_images = markdown_data.get('images', {})
  312. for img_path, img_base64 in markdown_images.items():
  313. try:
  314. img_data = base64.b64decode(img_base64)
  315. full_img_path = output_path / img_path
  316. full_img_path.parent.mkdir(parents=True, exist_ok=True)
  317. with open(full_img_path, 'wb') as f:
  318. f.write(img_data)
  319. # print(f"🖼️ Saved Markdown image: {full_img_path}")
  320. except Exception as e:
  321. print(f"❌ Error saving Markdown image {img_path}: {e}")
  322. return str(md_file_path)
  323. def call_api_for_image(image_path: str, api_url: str, timeout: int = 300) -> Dict[str, Any]:
  324. """
  325. 为单个图像调用API
  326. Args:
  327. image_path: 图像文件路径
  328. api_url: API URL
  329. timeout: 超时时间(秒)
  330. Returns:
  331. API返回结果
  332. """
  333. try:
  334. # 对本地图像进行Base64编码
  335. with open(image_path, "rb") as file:
  336. image_bytes = file.read()
  337. image_data = base64.b64encode(image_bytes).decode("ascii")
  338. payload = {
  339. "file": image_data,
  340. "fileType": 1,
  341. # 添加管道参数设置
  342. "useDocOrientationClassify": True,
  343. "useDocUnwarping": False,
  344. "useSealRecognition": True,
  345. "useTableRecognition": True,
  346. "useFormulaRecognition": False, # 避免公式识别的索引错误
  347. "useChartRecognition": True,
  348. "useRegionDetection": False,
  349. }
  350. # 调用API
  351. response = requests.post(api_url, json=payload, timeout=timeout)
  352. response.raise_for_status()
  353. return response.json()["result"]
  354. except requests.exceptions.Timeout:
  355. raise Exception(f"API调用超时 ({timeout}秒)")
  356. except requests.exceptions.RequestException as e:
  357. raise Exception(f"API调用失败: {e}")
  358. except KeyError:
  359. raise Exception("API返回格式错误,缺少'result'字段")
  360. except Exception as e:
  361. raise Exception(f"处理图像时发生错误: {e}")
  362. def process_images_via_api(image_paths: List[str],
  363. api_url: str,
  364. output_dir: str = "./output",
  365. normalize_numbers: bool = True,
  366. timeout: int = 300) -> List[Dict[str, Any]]:
  367. """
  368. 通过API统一处理图像文件
  369. Args:
  370. image_paths: 图像路径列表
  371. api_url: API URL
  372. output_dir: 输出目录
  373. normalize_numbers: 是否标准化数字格式
  374. timeout: API调用超时时间
  375. Returns:
  376. 处理结果列表
  377. """
  378. # 创建输出目录
  379. output_path = Path(output_dir)
  380. output_path.mkdir(parents=True, exist_ok=True)
  381. all_results = []
  382. total_images = len(image_paths)
  383. print(f"Processing {total_images} images via API")
  384. # 使用tqdm显示进度
  385. with tqdm(total=total_images, desc="Processing images", unit="img",
  386. bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
  387. # 逐个处理图像
  388. for img_path in image_paths:
  389. start_time = time.time()
  390. try:
  391. # 调用API处理图像
  392. api_result = call_api_for_image(img_path, api_url, timeout)
  393. processing_time = time.time() - start_time
  394. # 处理API返回结果
  395. input_path = Path(img_path)
  396. # 生成输出文件名
  397. output_filename = input_path.stem
  398. # 转换并保存标准JSON格式
  399. json_output_path, converted_json = convert_api_result_to_json(
  400. api_result,
  401. str(input_path),
  402. output_dir,
  403. output_filename,
  404. normalize_numbers=normalize_numbers
  405. )
  406. # 保存输出图像
  407. saved_images = save_output_images(api_result, str(output_dir), output_filename)
  408. # 保存Markdown内容
  409. md_output_path = save_markdown_content(
  410. api_result,
  411. output_dir,
  412. output_filename,
  413. normalize_numbers=normalize_numbers
  414. )
  415. # 记录处理结果
  416. all_results.append({
  417. "image_path": str(input_path),
  418. "processing_time": processing_time,
  419. "success": True,
  420. "api_url": api_url,
  421. "output_json": json_output_path,
  422. "output_md": md_output_path,
  423. "is_pdf_page": "_page_" in input_path.name, # 标记是否为PDF页面
  424. "processing_info": converted_json.get('processing_info', {})
  425. })
  426. # 更新进度条
  427. success_count = sum(1 for r in all_results if r.get('success', False))
  428. pbar.update(1)
  429. pbar.set_postfix({
  430. 'time': f"{processing_time:.2f}s",
  431. 'success': f"{success_count}/{len(all_results)}",
  432. 'rate': f"{success_count/len(all_results)*100:.1f}%"
  433. })
  434. except Exception as e:
  435. print(f"Error processing {Path(img_path).name}: {e}", file=sys.stderr)
  436. import traceback
  437. traceback.print_exc()
  438. # 添加错误结果
  439. all_results.append({
  440. "image_path": str(img_path),
  441. "processing_time": 0,
  442. "success": False,
  443. "api_url": api_url,
  444. "error": str(e),
  445. "is_pdf_page": "_page_" in Path(img_path).name
  446. })
  447. pbar.update(1)
  448. return all_results
  449. def main():
  450. """主函数"""
  451. parser = argparse.ArgumentParser(description="PaddleX PP-StructureV3 API Client - Unified PDF/Image Processor")
  452. # 参数定义
  453. input_group = parser.add_mutually_exclusive_group(required=True)
  454. input_group.add_argument("--input_file", type=str, help="Input file (supports both PDF and image file)")
  455. input_group.add_argument("--input_dir", type=str, help="Input directory (supports both PDF and image files)")
  456. input_group.add_argument("--input_file_list", type=str, help="Input file list (one file per line)")
  457. input_group.add_argument("--input_csv", type=str, help="Input CSV file with image_path and status columns")
  458. parser.add_argument("--output_dir", type=str, required=True, help="Output directory")
  459. parser.add_argument("--api_url", type=str, default="http://localhost:8080/layout-parsing", help="API URL")
  460. parser.add_argument("--pdf_dpi", type=int, default=200, help="DPI for PDF to image conversion")
  461. parser.add_argument("--timeout", type=int, default=300, help="API timeout in seconds")
  462. parser.add_argument("--no-normalize", action="store_true", help="禁用数字标准化")
  463. parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 20 files)")
  464. parser.add_argument("--collect_results", type=str, help="收集处理结果到指定CSV文件")
  465. args = parser.parse_args()
  466. normalize_numbers = not args.no_normalize
  467. try:
  468. # 获取并预处理输入文件
  469. print("🔄 Preprocessing input files...")
  470. input_files = get_input_files(args)
  471. if not input_files:
  472. print("❌ No input files found or processed")
  473. return 1
  474. if args.test_mode:
  475. input_files = input_files[:20]
  476. print(f"Test mode: processing only {len(input_files)} images")
  477. print(f"🌐 Using API: {args.api_url}")
  478. print(f"🔧 数字标准化: {'启用' if normalize_numbers else '禁用'}")
  479. print(f"⏱️ Timeout: {args.timeout} seconds")
  480. # 开始处理
  481. start_time = time.time()
  482. results = process_images_via_api(
  483. input_files,
  484. args.api_url,
  485. args.output_dir,
  486. normalize_numbers=normalize_numbers,
  487. timeout=args.timeout
  488. )
  489. total_time = time.time() - start_time
  490. # 统计结果
  491. success_count = sum(1 for r in results if r.get('success', False))
  492. error_count = len(results) - success_count
  493. pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))
  494. total_changes = sum(r.get('processing_info', {}).get('character_changes_count', 0) for r in results if 'processing_info' in r)
  495. print(f"\n" + "="*60)
  496. print(f"✅ API Processing completed!")
  497. print(f"📊 Statistics:")
  498. print(f" Total files processed: {len(input_files)}")
  499. print(f" PDF pages processed: {pdf_page_count}")
  500. print(f" Regular images processed: {len(input_files) - pdf_page_count}")
  501. print(f" Successful: {success_count}")
  502. print(f" Failed: {error_count}")
  503. if len(input_files) > 0:
  504. print(f" Success rate: {success_count / len(input_files) * 100:.2f}%")
  505. if normalize_numbers:
  506. print(f" 总标准化字符数: {total_changes}")
  507. print(f"⏱️ Performance:")
  508. print(f" Total time: {total_time:.2f} seconds")
  509. if total_time > 0:
  510. print(f" Throughput: {len(input_files) / total_time:.2f} files/second")
  511. print(f" Avg time per file: {total_time / len(input_files):.2f} seconds")
  512. # 保存结果统计
  513. stats = {
  514. "total_files": len(input_files),
  515. "pdf_pages": pdf_page_count,
  516. "regular_images": len(input_files) - pdf_page_count,
  517. "success_count": success_count,
  518. "error_count": error_count,
  519. "success_rate": success_count / len(input_files) if len(input_files) > 0 else 0,
  520. "total_time": total_time,
  521. "throughput": len(input_files) / total_time if total_time > 0 else 0,
  522. "avg_time_per_file": total_time / len(input_files) if len(input_files) > 0 else 0,
  523. "api_url": args.api_url,
  524. "pdf_dpi": args.pdf_dpi,
  525. "normalize_numbers": normalize_numbers,
  526. "total_character_changes": total_changes,
  527. "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
  528. }
  529. # 保存最终结果
  530. output_file_name = Path(args.output_dir).name
  531. output_file = os.path.join(args.output_dir, f"{output_file_name}_api_results.json")
  532. final_results = {
  533. "stats": stats,
  534. "results": results
  535. }
  536. with open(output_file, 'w', encoding='utf-8') as f:
  537. json.dump(final_results, f, ensure_ascii=False, indent=2)
  538. print(f"💾 Results saved to: {output_file}")
  539. # 如果没有收集结果的路径,使用缺省文件名,和output_dir同一路径
  540. if not args.collect_results:
  541. output_file_processed = Path(args.output_dir) / f"processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv"
  542. else:
  543. output_file_processed = Path(args.collect_results).resolve()
  544. processed_files = collect_pid_files(output_file)
  545. with open(output_file_processed, 'w', encoding='utf-8') as f:
  546. f.write("image_path,status\n")
  547. for file_path, status in processed_files:
  548. f.write(f"{file_path},{status}\n")
  549. print(f"💾 Processed files saved to: {output_file_processed}")
  550. return 0
  551. except Exception as e:
  552. print(f"❌ Processing failed: {e}", file=sys.stderr)
  553. traceback.print_exc()
  554. return 1
  555. if __name__ == "__main__":
  556. print(f"🚀 启动PP-StructureV3 API客户端...")
  557. if len(sys.argv) == 1:
  558. # 如果没有命令行参数,使用默认配置运行
  559. print("ℹ️ No command line arguments provided. Running with default configuration...")
  560. # 默认配置
  561. default_config = {
  562. "input_file": "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司/2023年度报告母公司_page_027.png",
  563. # "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
  564. "output_dir": "./OmniDocBench_API_Results",
  565. "api_url": "http://10.192.72.11:8111/layout-parsing",
  566. "timeout": "300",
  567. "collect_results": f"./OmniDocBench_API_Results/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
  568. }
  569. # 构造参数
  570. sys.argv = [sys.argv[0]]
  571. for key, value in default_config.items():
  572. sys.argv.extend([f"--{key}", str(value)])
  573. # sys.argv.append("--no-normalize")
  574. # 测试模式
  575. # sys.argv.append("--test_mode")
  576. sys.exit(main())