|
@@ -390,152 +390,3 @@ class TableLineGenerator:
|
|
|
# 保存
|
|
# 保存
|
|
|
target_img.save(output_path)
|
|
target_img.save(output_path)
|
|
|
return output_path
|
|
return output_path
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def generate_table_lines_from_ppstructure(
    json_path: str,
    output_dir: str,
    config: Dict
) -> Dict:
    """
    Generate a table-line overlay image from a PPStructure V3 result file.

    The JSON result is loaded and passed to
    ``TableLineGenerator.parse_ppstructure_result`` to obtain the table
    bounding box and the text boxes. The page image is then located, a
    ``TableLineGenerator`` analyses the table structure and renders the
    lines, and both the rendered image and the structure description are
    written into ``output_dir``.

    Args:
        json_path: Path to the PPStructure V3 result JSON.
        output_dir: Directory that receives the outputs; created if missing.
        config: Options dictionary. Keys read here, with their fallbacks:
            ``y_tolerance`` (5), ``x_tolerance`` (10),
            ``min_row_height`` (20), ``line_color`` ([0, 0, 255]) and
            ``line_width`` (2).

    Returns:
        A dictionary with the keys ``image_with_lines`` and
        ``structure_config`` (output paths as strings), ``structure``
        (the result of the structure analysis), ``table_bbox`` and
        ``text_boxes_count``.

    Raises:
        FileNotFoundError: If no page image can be located.
    """
    # 1. Read the PPStructure V3 result from disk.
    with open(json_path, 'r', encoding='utf-8') as result_file:
        ppstructure_result = json.load(result_file)

    # 2. Extract the table region and the text boxes.
    table_bbox, text_boxes = TableLineGenerator.parse_ppstructure_result(
        ppstructure_result
    )

    print(f"✅ 表格区域: {table_bbox}")
    print(f"✅ 表格内文本框数量: {len(text_boxes)}")

    # 3. Locate the page image.
    json_file = Path(json_path)

    # Prefer the original image path recorded in the result itself.
    recorded_input = ppstructure_result.get('input_path')
    if recorded_input and Path(recorded_input).exists():
        image_path = Path(recorded_input)
    else:
        # Otherwise look next to the JSON file: ".png" first, then ".jpg".
        png_candidate = json_file.with_suffix('.png')
        if png_candidate.exists():
            image_path = png_candidate
        else:
            image_path = json_file.with_suffix('.jpg')

    if not image_path.exists():
        raise FileNotFoundError(f"找不到图片: {image_path}")

    print(f"✅ 图片路径: {image_path}")

    # 4. Set up the generator on the located image and the text boxes.
    generator = TableLineGenerator(str(image_path), text_boxes)

    # 5. Analyse rows and columns with the configured tolerances.
    analysis_options = {
        'y_tolerance': config.get('y_tolerance', 5),
        'x_tolerance': config.get('x_tolerance', 10),
        'min_row_height': config.get('min_row_height', 20),
    }
    structure = generator.analyze_table_structure(**analysis_options)

    print(f"✅ 检测到 {len(structure['rows'])} 行,{len(structure['columns'])} 列")
    print(f"✅ 标准行高: {structure['row_height']}px")

    # 6. Render the table lines.
    drawing_options = {
        'line_color': tuple(config.get('line_color', [0, 0, 255])),
        'line_width': config.get('line_width', 2),
    }
    img_with_lines = generator.generate_table_lines(**drawing_options)

    # 7. Persist the rendered image and the structure description.
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    output_image_path = destination / f"{json_file.stem}_with_lines.jpg"
    img_with_lines.save(output_image_path)

    structure_path = destination / f"{json_file.stem}_structure.json"
    generator.save_table_structure(str(structure_path))

    summary = {
        'image_with_lines': str(output_image_path),
        'structure_config': str(structure_path),
        'structure': structure,
        'table_bbox': table_bbox,
        'text_boxes_count': len(text_boxes),
    }
    return summary
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def generate_table_lines_for_page(json_path: str,
                                  output_dir: str,
                                  config: Dict) -> Dict:
    """
    Generate table lines for a single page (legacy-compatible entry point).

    If the loaded JSON contains both ``parsing_res_list`` and
    ``overall_ocr_res`` it is treated as a PPStructure result and the call
    is delegated to ``generate_table_lines_from_ppstructure``. Otherwise
    the whole JSON payload is handed to ``TableLineGenerator`` together
    with an image of the same base name found next to the JSON file.

    Args:
        json_path: Path to the OCR result JSON.
        output_dir: Directory that receives the outputs; created if missing.
        config: Options dictionary. Keys read here, with their fallbacks:
            ``y_tolerance`` (5), ``x_tolerance`` (10),
            ``min_row_height`` (20), ``line_color`` ([0, 0, 255]) and
            ``line_width`` (2).

    Returns:
        On the legacy path, a dictionary with the keys
        ``image_with_lines`` and ``structure_config`` (output paths as
        strings) and ``structure``. On the PPStructure path, whatever
        ``generate_table_lines_from_ppstructure`` returns.

    Raises:
        FileNotFoundError: If neither a ``.jpg`` nor a ``.png`` image
            exists next to the JSON file (legacy path).
    """
    # Load the OCR data.
    with open(json_path, 'r', encoding='utf-8') as ocr_file:
        ocr_data = json.load(ocr_file)

    # A PPStructure result carries both marker keys; hand it to the
    # dedicated PPStructure implementation.
    ppstructure_markers = ('parsing_res_list', 'overall_ocr_res')
    if all(marker in ocr_data for marker in ppstructure_markers):
        return generate_table_lines_from_ppstructure(json_path, output_dir, config)

    # Look for the page image next to the JSON: ".jpg" first, then ".png".
    json_file = Path(json_path)
    jpg_candidate = json_file.with_suffix('.jpg')
    if jpg_candidate.exists():
        image_path = jpg_candidate
    else:
        image_path = json_file.with_suffix('.png')

    if not image_path.exists():
        raise FileNotFoundError(f"找不到图片: {image_path}")

    # Set up the generator on the image and the raw OCR data.
    generator = TableLineGenerator(str(image_path), ocr_data)

    # Analyse rows and columns with the configured tolerances.
    analysis_options = {
        'y_tolerance': config.get('y_tolerance', 5),
        'x_tolerance': config.get('x_tolerance', 10),
        'min_row_height': config.get('min_row_height', 20),
    }
    structure = generator.analyze_table_structure(**analysis_options)

    # Render the table lines.
    drawing_options = {
        'line_color': tuple(config.get('line_color', [0, 0, 255])),
        'line_width': config.get('line_width', 2),
    }
    img_with_lines = generator.generate_table_lines(**drawing_options)

    # Persist the rendered image.
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    output_image_path = destination / f"{json_file.stem}_with_lines.jpg"
    img_with_lines.save(output_image_path)

    # Persist the table structure description.
    structure_path = destination / f"{json_file.stem}_structure.json"
    generator.save_table_structure(str(structure_path))

    summary = {
        'image_with_lines': str(output_image_path),
        'structure_config': str(structure_path),
        'structure': structure,
    }
    return summary
|
|
|