# batch_apply_table_lines.py
  1. """
  2. 批量将表格结构应用到所有页
  3. """
  4. import json
  5. from pathlib import Path
  6. from table_line_generator import TableLineGenerator
  7. from PIL import Image
  8. from typing import List
  9. import argparse
  10. def batch_apply_table_structure(
  11. source_json_path: str,
  12. target_image_dir: str,
  13. output_dir: str,
  14. structure_config_path: str = None
  15. ):
  16. """
  17. 批量应用表格结构
  18. Args:
  19. source_json_path: 源OCR结果JSON路径(用于生成初始结构)
  20. target_image_dir: 目标图片目录
  21. output_dir: 输出目录
  22. structure_config_path: 表格结构配置路径(可选)
  23. """
  24. # 1. 加载或生成表格结构
  25. if structure_config_path and Path(structure_config_path).exists():
  26. # 加载已有配置
  27. with open(structure_config_path, 'r') as f:
  28. structure = json.load(f)
  29. print(f"📂 加载表格结构: {structure_config_path}")
  30. else:
  31. # 生成新配置
  32. with open(source_json_path, 'r') as f:
  33. ocr_data = json.load(f)
  34. source_image_path = Path(source_json_path).with_suffix('.jpg')
  35. generator = TableLineGenerator(str(source_image_path), ocr_data)
  36. structure_info = generator.analyze_table_structure()
  37. structure = generator.save_table_structure(
  38. f"{output_dir}/table_structure.json"
  39. )
  40. print(f"✅ 生成表格结构配置")
  41. # 2. 查找所有目标图片
  42. target_images = list(Path(target_image_dir).glob("*.jpg"))
  43. target_images.extend(list(Path(target_image_dir).glob("*.png")))
  44. target_images = sorted(target_images)
  45. print(f"📁 找到 {len(target_images)} 个图片文件")
  46. # 3. 批量应用
  47. output_path = Path(output_dir)
  48. output_path.mkdir(parents=True, exist_ok=True)
  49. results = []
  50. for image_path in target_images:
  51. try:
  52. # 创建临时生成器(用于应用结构)
  53. generator = TableLineGenerator(str(image_path), [])
  54. generator.rows = structure.get('rows', [])
  55. generator.columns = structure.get('columns', [])
  56. generator.row_height = structure.get('row_height', 30)
  57. # 应用结构
  58. output_file = output_path / f"{image_path.stem}_with_lines.jpg"
  59. generator.apply_structure_to_image(
  60. str(image_path),
  61. structure,
  62. str(output_file)
  63. )
  64. results.append({
  65. 'source': str(image_path),
  66. 'output': str(output_file),
  67. 'status': 'success'
  68. })
  69. print(f"✅ {image_path.name} → {output_file.name}")
  70. except Exception as e:
  71. results.append({
  72. 'source': str(image_path),
  73. 'status': 'error',
  74. 'error': str(e)
  75. })
  76. print(f"❌ {image_path.name} 失败: {e}")
  77. # 保存结果
  78. with open(output_path / "batch_results.json", 'w') as f:
  79. json.dump(results, f, indent=2, ensure_ascii=False)
  80. success_count = sum(1 for r in results if r['status'] == 'success')
  81. print(f"\n🎉 完成!成功: {success_count}/{len(results)}")
  82. if __name__ == "__main__":
  83. parser = argparse.ArgumentParser(description="批量应用表格结构")
  84. parser.add_argument('-s', '--source', required=True, help="源OCR结果JSON路径")
  85. parser.add_argument('-t', '--target', required=True, help="目标图片目录")
  86. parser.add_argument('-o', '--output', required=True, help="输出目录")
  87. parser.add_argument('-c', '--config', help="表格结构配置路径")
  88. args = parser.parse_args()
  89. batch_apply_table_structure(
  90. args.source,
  91. args.target,
  92. args.output,
  93. args.config
  94. )