|
@@ -9,63 +9,60 @@ from PIL import Image, ImageDraw
|
|
|
from typing import Dict, List, Tuple
|
|
from typing import Dict, List, Tuple
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
import argparse
|
|
import argparse
|
|
|
|
|
+import sys
|
|
|
|
|
+
|
|
|
|
|
+# 添加父目录到路径
|
|
|
|
|
+sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
|
|
+ from editor.data_processor import get_structure_from_ocr
|
|
|
from table_line_generator import TableLineGenerator
|
|
from table_line_generator import TableLineGenerator
|
|
|
except ImportError:
|
|
except ImportError:
|
|
|
|
|
+ from .editor.data_processor import get_structure_from_ocr
|
|
|
from .table_line_generator import TableLineGenerator
|
|
from .table_line_generator import TableLineGenerator
|
|
|
|
|
|
|
|
|
|
|
|
|
class TableTemplateApplier:
|
|
class TableTemplateApplier:
|
|
|
- """表格模板应用器"""
|
|
|
|
|
|
|
+ """表格模板应用器(混合模式)"""
|
|
|
|
|
|
|
|
def __init__(self, template_config_path: str):
|
|
def __init__(self, template_config_path: str):
|
|
|
- """
|
|
|
|
|
- 初始化模板应用器
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- template_config_path: 模板配置文件路径(人工标注的结果)
|
|
|
|
|
- """
|
|
|
|
|
|
|
+ """初始化时只提取列信息和表头信息"""
|
|
|
with open(template_config_path, 'r', encoding='utf-8') as f:
|
|
with open(template_config_path, 'r', encoding='utf-8') as f:
|
|
|
self.template = json.load(f)
|
|
self.template = json.load(f)
|
|
|
|
|
|
|
|
- # 🎯 从标注结果提取固定参数
|
|
|
|
|
|
|
+ # ✅ 只提取列宽(固定)
|
|
|
self.col_widths = self.template['col_widths']
|
|
self.col_widths = self.template['col_widths']
|
|
|
|
|
|
|
|
- # 🔧 计算数据行的标准行高(排除表头)
|
|
|
|
|
- rows = self.template['rows']
|
|
|
|
|
- if len(rows) > 1:
|
|
|
|
|
- # 计算每行的实际高度
|
|
|
|
|
- row_heights = [row['y_end'] - row['y_start'] for row in rows]
|
|
|
|
|
-
|
|
|
|
|
- # 🎯 假设第一行是表头,从第二行开始计算
|
|
|
|
|
- data_row_heights = row_heights[1:] if len(row_heights) > 1 else row_heights
|
|
|
|
|
-
|
|
|
|
|
- # 使用中位数作为标准行高(更稳健)
|
|
|
|
|
- self.row_height = int(np.median(data_row_heights))
|
|
|
|
|
- self.header_height = row_heights[0] if row_heights else self.row_height
|
|
|
|
|
-
|
|
|
|
|
- print(f"📏 表头高度: {self.header_height}px")
|
|
|
|
|
- print(f"📏 数据行高度: {self.row_height}px")
|
|
|
|
|
- print(f" (从 {len(data_row_heights)} 行数据中计算,中位数)")
|
|
|
|
|
- else:
|
|
|
|
|
- # 兜底方案
|
|
|
|
|
- self.row_height = self.template.get('row_height', 60)
|
|
|
|
|
- self.header_height = self.row_height
|
|
|
|
|
-
|
|
|
|
|
- # 🎯 计算列的相对位置(从第一列开始的偏移量)
|
|
|
|
|
|
|
+ # ✅ 计算列的相对位置
|
|
|
self.col_offsets = [0]
|
|
self.col_offsets = [0]
|
|
|
for width in self.col_widths:
|
|
for width in self.col_widths:
|
|
|
self.col_offsets.append(self.col_offsets[-1] + width)
|
|
self.col_offsets.append(self.col_offsets[-1] + width)
|
|
|
|
|
|
|
|
- # 🎯 提取表头的Y坐标(作为参考)
|
|
|
|
|
- self.template_header_y = rows[0]['y_start'] if rows else 0
|
|
|
|
|
|
|
+ # ✅ 提取表头高度(通常固定)
|
|
|
|
|
+ rows = self.template['rows']
|
|
|
|
|
+ if rows:
|
|
|
|
|
+ self.header_height = rows[0]['y_end'] - rows[0]['y_start']
|
|
|
|
|
+ else:
|
|
|
|
|
+ self.header_height = 40
|
|
|
|
|
+
|
|
|
|
|
+ # ✅ 计算数据行高度(用于固定行高模式)
|
|
|
|
|
+ if len(rows) > 1:
|
|
|
|
|
+ data_row_heights = [row['y_end'] - row['y_start'] for row in rows[1:]]
|
|
|
|
|
+ # 使用中位数作为典型行高
|
|
|
|
|
+ self.row_height = int(np.median(data_row_heights)) if data_row_heights else 40
|
|
|
|
|
+ # 兜底行高(同样使用中位数)
|
|
|
|
|
+ self.fallback_row_height = self.row_height
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 如果只有表头,使用默认值
|
|
|
|
|
+ self.row_height = 40
|
|
|
|
|
+ self.fallback_row_height = 40
|
|
|
|
|
|
|
|
print(f"\n✅ 加载模板配置:")
|
|
print(f"\n✅ 加载模板配置:")
|
|
|
- print(f" 表头高度: {self.header_height}px")
|
|
|
|
|
- print(f" 数据行高度: {self.row_height}px")
|
|
|
|
|
print(f" 列数: {len(self.col_widths)}")
|
|
print(f" 列数: {len(self.col_widths)}")
|
|
|
print(f" 列宽: {self.col_widths}")
|
|
print(f" 列宽: {self.col_widths}")
|
|
|
|
|
+ print(f" 表头高度: {self.header_height}px")
|
|
|
|
|
+ print(f" 数据行高: {self.row_height}px (用于固定行高模式)")
|
|
|
|
|
+ print(f" 兜底行高: {self.fallback_row_height}px (OCR失败时使用)")
|
|
|
|
|
|
|
|
def detect_table_anchor(self, ocr_data: List[Dict]) -> Tuple[int, int]:
|
|
def detect_table_anchor(self, ocr_data: List[Dict]) -> Tuple[int, int]:
|
|
|
"""
|
|
"""
|
|
@@ -128,14 +125,14 @@ class TableTemplateApplier:
|
|
|
|
|
|
|
|
return total_rows
|
|
return total_rows
|
|
|
|
|
|
|
|
- def apply_to_image(self,
|
|
|
|
|
|
|
+ def apply_template_fixed(self,
|
|
|
image: Image.Image,
|
|
image: Image.Image,
|
|
|
ocr_data: List[Dict],
|
|
ocr_data: List[Dict],
|
|
|
anchor_x: int = None,
|
|
anchor_x: int = None,
|
|
|
anchor_y: int = None,
|
|
anchor_y: int = None,
|
|
|
num_rows: int = None,
|
|
num_rows: int = None,
|
|
|
line_width: int = 2,
|
|
line_width: int = 2,
|
|
|
- line_color: Tuple[int, int, int] = (0, 0, 0)) -> Image.Image:
|
|
|
|
|
|
|
+ line_color: Tuple[int, int, int] = (0, 0, 0)) -> Tuple[Image.Image, Dict]:
|
|
|
"""
|
|
"""
|
|
|
将模板应用到图片
|
|
将模板应用到图片
|
|
|
|
|
|
|
@@ -202,62 +199,208 @@ class TableTemplateApplier:
|
|
|
y_end = horizontal_lines[-1]
|
|
y_end = horizontal_lines[-1]
|
|
|
for x in vertical_lines:
|
|
for x in vertical_lines:
|
|
|
draw.line([(x, y_start), (x, y_end)], fill=line_color, width=line_width)
|
|
draw.line([(x, y_start), (x, y_end)], fill=line_color, width=line_width)
|
|
|
|
|
+
|
|
|
|
|
+ print(f"✅ 表格绘制完成: {len(horizontal_lines)}行 × {len(vertical_lines)-1}列")
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 生成结构信息
|
|
|
|
|
+ structure = self._build_structure(
|
|
|
|
|
+ horizontal_lines,
|
|
|
|
|
+ vertical_lines,
|
|
|
|
|
+ anchor_x,
|
|
|
|
|
+ anchor_y,
|
|
|
|
|
+ mode='fixed'
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- return img_with_lines
|
|
|
|
|
-
|
|
|
|
|
- def generate_structure_for_image(self,
|
|
|
|
|
- ocr_data: List[Dict],
|
|
|
|
|
- anchor_x: int = None,
|
|
|
|
|
- anchor_y: int = None,
|
|
|
|
|
- num_rows: int = None) -> Dict:
|
|
|
|
|
|
|
+ return img_with_lines, structure
|
|
|
|
|
+
|
|
|
|
|
+ def apply_template_hybrid(self,
|
|
|
|
|
+ image: Image.Image,
|
|
|
|
|
+ ocr_data_dict: Dict,
|
|
|
|
|
+ use_ocr_rows: bool = True,
|
|
|
|
|
+ anchor_x: int = None,
|
|
|
|
|
+ anchor_y: int = None,
|
|
|
|
|
+ y_tolerance: int = 5,
|
|
|
|
|
+ line_width: int = 2,
|
|
|
|
|
+ line_color: Tuple[int, int, int] = (0, 0, 0)) -> Tuple[Image.Image, Dict]:
|
|
|
"""
|
|
"""
|
|
|
- 为新图片生成表格结构配置
|
|
|
|
|
|
|
+ 混合模式:使用模板的列 + OCR的行
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
- ocr_data: OCR识别结果
|
|
|
|
|
|
|
+ image: 目标图片
|
|
|
|
|
+ ocr_data: OCR识别结果(用于检测行)
|
|
|
|
|
+ use_ocr_rows: 是否使用OCR检测的行(True=自适应行高)
|
|
|
anchor_x: 表格起始X坐标(None=自动检测)
|
|
anchor_x: 表格起始X坐标(None=自动检测)
|
|
|
anchor_y: 表头起始Y坐标(None=自动检测)
|
|
anchor_y: 表头起始Y坐标(None=自动检测)
|
|
|
- num_rows: 总行数(None=自动检测)
|
|
|
|
|
|
|
+ y_tolerance: Y轴聚类容差(像素)
|
|
|
|
|
+ line_width: 线条宽度
|
|
|
|
|
+ line_color: 线条颜色
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
- 表格结构配置
|
|
|
|
|
|
|
+ 绘制了表格线的图片, 结构信息
|
|
|
"""
|
|
"""
|
|
|
|
|
+ img_with_lines = image.copy()
|
|
|
|
|
+ draw = ImageDraw.Draw(img_with_lines)
|
|
|
|
|
+
|
|
|
|
|
+ ocr_data = ocr_data_dict.get('text_boxes', [])
|
|
|
|
|
+
|
|
|
# 🔍 自动检测锚点
|
|
# 🔍 自动检测锚点
|
|
|
if anchor_x is None or anchor_y is None:
|
|
if anchor_x is None or anchor_y is None:
|
|
|
detected_x, detected_y = self.detect_table_anchor(ocr_data)
|
|
detected_x, detected_y = self.detect_table_anchor(ocr_data)
|
|
|
anchor_x = anchor_x or detected_x
|
|
anchor_x = anchor_x or detected_x
|
|
|
anchor_y = anchor_y or detected_y
|
|
anchor_y = anchor_y or detected_y
|
|
|
|
|
|
|
|
- # 🔍 自动检测行数
|
|
|
|
|
- if num_rows is None:
|
|
|
|
|
- num_rows = self.detect_table_rows(ocr_data, anchor_y)
|
|
|
|
|
|
|
+ print(f"\n📍 表格锚点: ({anchor_x}, {anchor_y})")
|
|
|
|
|
|
|
|
- # 🎨 生成横线坐标
|
|
|
|
|
- horizontal_lines = []
|
|
|
|
|
- horizontal_lines.append(anchor_y)
|
|
|
|
|
|
|
+ # ✅ 竖线:使用模板的列宽(固定)
|
|
|
|
|
+ vertical_lines = [anchor_x + offset for offset in self.col_offsets]
|
|
|
|
|
+ print(f"📏 竖线坐标: {vertical_lines} (使用模板,共{len(vertical_lines)}条)")
|
|
|
|
|
+
|
|
|
|
|
+ # ✅ 横线:根据模式选择
|
|
|
|
|
+ if use_ocr_rows and ocr_data:
|
|
|
|
|
+ horizontal_lines = self._detect_rows_from_ocr(
|
|
|
|
|
+ ocr_data, anchor_y, y_tolerance
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"📏 横线坐标: 使用OCR检测 (共{len(horizontal_lines)}条,自适应行高)")
|
|
|
|
|
+ else:
|
|
|
|
|
+ num_rows = self.detect_table_rows(ocr_data, anchor_y) if ocr_data else 10
|
|
|
|
|
+ horizontal_lines = self._generate_fixed_rows(anchor_y, num_rows)
|
|
|
|
|
+ print(f"📏 横线坐标: 使用固定行高 (共{len(horizontal_lines)}条)")
|
|
|
|
|
+
|
|
|
|
|
+ # 🖊️ 绘制横线
|
|
|
|
|
+ x_start = vertical_lines[0]
|
|
|
|
|
+ x_end = vertical_lines[-1]
|
|
|
|
|
+ for y in horizontal_lines:
|
|
|
|
|
+ draw.line([(x_start, y), (x_end, y)], fill=line_color, width=line_width)
|
|
|
|
|
+
|
|
|
|
|
+ # 🖊️ 绘制竖线
|
|
|
|
|
+ y_start = horizontal_lines[0]
|
|
|
|
|
+ y_end = horizontal_lines[-1]
|
|
|
|
|
+ for x in vertical_lines:
|
|
|
|
|
+ draw.line([(x, y_start), (x, y_end)], fill=line_color, width=line_width)
|
|
|
|
|
+
|
|
|
|
|
+ print(f"✅ 表格绘制完成: {len(horizontal_lines)}行 × {len(vertical_lines)-1}列")
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 生成结构信息
|
|
|
|
|
+ structure = self._build_structure(
|
|
|
|
|
+ horizontal_lines,
|
|
|
|
|
+ vertical_lines,
|
|
|
|
|
+ anchor_x,
|
|
|
|
|
+ anchor_y,
|
|
|
|
|
+ mode='hybrid'
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ return img_with_lines, structure
|
|
|
|
|
+
|
|
|
|
|
+ def _detect_rows_from_ocr(self,
|
|
|
|
|
+ ocr_data: List[Dict],
|
|
|
|
|
+ anchor_y: int,
|
|
|
|
|
+ y_tolerance: int = 5) -> List[int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 从OCR结果中检测行(自适应行高)
|
|
|
|
|
+ 复用 get_structure_from_ocr 统一接口
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ ocr_data: OCR识别结果(MinerU 格式的 text_boxes)
|
|
|
|
|
+ anchor_y: 表格起始Y坐标
|
|
|
|
|
+ y_tolerance: Y轴聚类容差(未使用,保留参数兼容性)
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 横线 y 坐标列表
|
|
|
|
|
+ """
|
|
|
|
|
+ if not ocr_data:
|
|
|
|
|
+ return [anchor_y, anchor_y + self.header_height]
|
|
|
|
|
+
|
|
|
|
|
+ print(f"\n🔍 OCR行检测 (使用 MinerU 算法):")
|
|
|
|
|
+ print(f" 有效文本框数: {len(ocr_data)}")
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 验证是否为 MinerU 格式
|
|
|
|
|
+ has_cell_index = any('row' in item and 'col' in item for item in ocr_data)
|
|
|
|
|
+
|
|
|
|
|
+ if not has_cell_index:
|
|
|
|
|
+ print(" ⚠️ 警告: OCR数据不包含 row/col 索引,可能不是 MinerU 格式")
|
|
|
|
|
+ print(" ⚠️ 混合模式需要 MinerU 格式的 JSON 文件")
|
|
|
|
|
+ return [anchor_y, anchor_y + self.header_height]
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 重构原始数据格式(MinerU 需要完整的 table 结构)
|
|
|
|
|
+ raw_data = {
|
|
|
|
|
+ 'type': 'table',
|
|
|
|
|
+ 'table_cells': ocr_data
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ # ✅ 使用统一接口解析和分析(无需 dummy_image)
|
|
|
|
|
+ table_bbox, structure = get_structure_from_ocr(
|
|
|
|
|
+ raw_data,
|
|
|
|
|
+ tool="mineru"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if not structure or 'horizontal_lines' not in structure:
|
|
|
|
|
+ print(" ⚠️ MinerU 分析失败,使用兜底方案")
|
|
|
|
|
+ return [anchor_y, anchor_y + self.header_height]
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 获取横线坐标
|
|
|
|
|
+ horizontal_lines = structure['horizontal_lines']
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 调整第一条线到 anchor_y(表头顶部)
|
|
|
|
|
+ if horizontal_lines:
|
|
|
|
|
+ offset = anchor_y - horizontal_lines[0]
|
|
|
|
|
+ horizontal_lines = [y + offset for y in horizontal_lines]
|
|
|
|
|
+
|
|
|
|
|
+ print(f" 检测到行数: {len(horizontal_lines) - 1}")
|
|
|
|
|
+
|
|
|
|
|
+ # 🔑 分析行高分布
|
|
|
|
|
+ if len(horizontal_lines) > 1:
|
|
|
|
|
+ row_heights = []
|
|
|
|
|
+ for i in range(len(horizontal_lines) - 1):
|
|
|
|
|
+ h = horizontal_lines[i+1] - horizontal_lines[i]
|
|
|
|
|
+ row_heights.append(h)
|
|
|
|
|
+
|
|
|
|
|
+ if len(row_heights) > 1:
|
|
|
|
|
+ import numpy as np
|
|
|
|
|
+ print(f" 行高分布: min={min(row_heights)}, "
|
|
|
|
|
+ f"median={int(np.median(row_heights))}, "
|
|
|
|
|
+ f"max={max(row_heights)}")
|
|
|
|
|
+
|
|
|
|
|
+ return horizontal_lines
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f" ⚠️ 解析失败: {e}")
|
|
|
|
|
+ import traceback
|
|
|
|
|
+ traceback.print_exc()
|
|
|
|
|
+ return [anchor_y, anchor_y + self.header_height]
|
|
|
|
|
+
|
|
|
|
|
+ def _generate_fixed_rows(self, anchor_y: int, num_rows: int) -> List[int]:
|
|
|
|
|
+ """生成固定行高的横线(兜底方案)"""
|
|
|
|
|
+ horizontal_lines = [anchor_y]
|
|
|
|
|
+
|
|
|
|
|
+ # 表头
|
|
|
horizontal_lines.append(anchor_y + self.header_height)
|
|
horizontal_lines.append(anchor_y + self.header_height)
|
|
|
|
|
|
|
|
|
|
+ # 数据行
|
|
|
current_y = anchor_y + self.header_height
|
|
current_y = anchor_y + self.header_height
|
|
|
for i in range(num_rows - 1):
|
|
for i in range(num_rows - 1):
|
|
|
- current_y += self.row_height
|
|
|
|
|
|
|
+ current_y += self.fallback_row_height
|
|
|
horizontal_lines.append(current_y)
|
|
horizontal_lines.append(current_y)
|
|
|
|
|
|
|
|
- # 🎨 生成竖线坐标
|
|
|
|
|
- vertical_lines = []
|
|
|
|
|
- for offset in self.col_offsets:
|
|
|
|
|
- x = anchor_x + offset
|
|
|
|
|
- vertical_lines.append(x)
|
|
|
|
|
-
|
|
|
|
|
- # 🎨 生成行区间
|
|
|
|
|
|
|
+ return horizontal_lines
|
|
|
|
|
+
|
|
|
|
|
+ def _build_structure(self,
|
|
|
|
|
+ horizontal_lines: List[int],
|
|
|
|
|
+ vertical_lines: List[int],
|
|
|
|
|
+ anchor_x: int,
|
|
|
|
|
+ anchor_y: int,
|
|
|
|
|
+ mode: str = 'fixed') -> Dict:
|
|
|
|
|
+ """构建表格结构信息(统一)"""
|
|
|
|
|
+ # 生成行区间
|
|
|
rows = []
|
|
rows = []
|
|
|
- for i in range(num_rows):
|
|
|
|
|
|
|
+ for i in range(len(horizontal_lines) - 1):
|
|
|
rows.append({
|
|
rows.append({
|
|
|
'y_start': horizontal_lines[i],
|
|
'y_start': horizontal_lines[i],
|
|
|
'y_end': horizontal_lines[i + 1],
|
|
'y_end': horizontal_lines[i + 1],
|
|
|
'bboxes': []
|
|
'bboxes': []
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
- # 🎨 生成列区间
|
|
|
|
|
|
|
+ # 生成列区间
|
|
|
columns = []
|
|
columns = []
|
|
|
for i in range(len(vertical_lines) - 1):
|
|
for i in range(len(vertical_lines) - 1):
|
|
|
columns.append({
|
|
columns.append({
|
|
@@ -265,30 +408,40 @@ class TableTemplateApplier:
|
|
|
'x_end': vertical_lines[i + 1]
|
|
'x_end': vertical_lines[i + 1]
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
|
|
+ # ✅ 根据模式设置正确的 mode 值
|
|
|
|
|
+ if mode == 'hybrid':
|
|
|
|
|
+ mode_value = 'hybrid'
|
|
|
|
|
+ elif mode == 'fixed':
|
|
|
|
|
+ mode_value = 'fixed'
|
|
|
|
|
+ else:
|
|
|
|
|
+ mode_value = mode # 保留原始值
|
|
|
|
|
+
|
|
|
return {
|
|
return {
|
|
|
'rows': rows,
|
|
'rows': rows,
|
|
|
'columns': columns,
|
|
'columns': columns,
|
|
|
'horizontal_lines': horizontal_lines,
|
|
'horizontal_lines': horizontal_lines,
|
|
|
'vertical_lines': vertical_lines,
|
|
'vertical_lines': vertical_lines,
|
|
|
- 'header_height': self.header_height,
|
|
|
|
|
- 'row_height': self.row_height,
|
|
|
|
|
'col_widths': self.col_widths,
|
|
'col_widths': self.col_widths,
|
|
|
|
|
+ 'row_height': self.row_height if mode == 'fixed' else None,
|
|
|
'table_bbox': [
|
|
'table_bbox': [
|
|
|
vertical_lines[0],
|
|
vertical_lines[0],
|
|
|
horizontal_lines[0],
|
|
horizontal_lines[0],
|
|
|
vertical_lines[-1],
|
|
vertical_lines[-1],
|
|
|
horizontal_lines[-1]
|
|
horizontal_lines[-1]
|
|
|
],
|
|
],
|
|
|
|
|
+ 'mode': mode_value, # ✅ 确保有 mode 字段
|
|
|
'anchor': {'x': anchor_x, 'y': anchor_y},
|
|
'anchor': {'x': anchor_x, 'y': anchor_y},
|
|
|
- 'num_rows': num_rows
|
|
|
|
|
|
|
+ 'modified_h_lines': [], # ✅ 添加修改记录字段
|
|
|
|
|
+ 'modified_v_lines': [] # ✅ 添加修改记录字段
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-
|
|
|
|
|
def apply_template_to_single_file(
|
|
def apply_template_to_single_file(
|
|
|
applier: TableTemplateApplier,
|
|
applier: TableTemplateApplier,
|
|
|
image_file: Path,
|
|
image_file: Path,
|
|
|
json_file: Path,
|
|
json_file: Path,
|
|
|
output_dir: Path,
|
|
output_dir: Path,
|
|
|
|
|
+ structure_suffix: str = "_structure.json",
|
|
|
|
|
+ use_hybrid_mode: bool = True,
|
|
|
line_width: int = 2,
|
|
line_width: int = 2,
|
|
|
line_color: Tuple[int, int, int] = (0, 0, 0)
|
|
line_color: Tuple[int, int, int] = (0, 0, 0)
|
|
|
) -> bool:
|
|
) -> bool:
|
|
@@ -300,6 +453,7 @@ def apply_template_to_single_file(
|
|
|
image_file: 图片文件路径
|
|
image_file: 图片文件路径
|
|
|
json_file: OCR JSON文件路径
|
|
json_file: OCR JSON文件路径
|
|
|
output_dir: 输出目录
|
|
output_dir: 输出目录
|
|
|
|
|
+ use_hybrid_mode: 是否使用混合模式(需要 MinerU 格式)
|
|
|
line_width: 线条宽度
|
|
line_width: 线条宽度
|
|
|
line_color: 线条颜色
|
|
line_color: 线条颜色
|
|
|
|
|
|
|
@@ -313,39 +467,79 @@ def apply_template_to_single_file(
|
|
|
with open(json_file, 'r', encoding='utf-8') as f:
|
|
with open(json_file, 'r', encoding='utf-8') as f:
|
|
|
raw_data = json.load(f)
|
|
raw_data = json.load(f)
|
|
|
|
|
|
|
|
- # 🔧 解析OCR数据(支持PPStructure格式)
|
|
|
|
|
|
|
+ # 🔑 自动检测 OCR 格式
|
|
|
|
|
+ ocr_format = None
|
|
|
|
|
+
|
|
|
if 'parsing_res_list' in raw_data and 'overall_ocr_res' in raw_data:
|
|
if 'parsing_res_list' in raw_data and 'overall_ocr_res' in raw_data:
|
|
|
- table_bbox, ocr_data = TableLineGenerator.parse_ppstructure_result(raw_data)
|
|
|
|
|
|
|
+ # PPStructure 格式
|
|
|
|
|
+ ocr_format = 'ppstructure'
|
|
|
|
|
+ elif isinstance(raw_data, (list, dict)):
|
|
|
|
|
+ # 尝试提取 MinerU 格式
|
|
|
|
|
+ table_data = None
|
|
|
|
|
+ if isinstance(raw_data, list):
|
|
|
|
|
+ for item in raw_data:
|
|
|
|
|
+ if isinstance(item, dict) and item.get('type') == 'table':
|
|
|
|
|
+ table_data = item
|
|
|
|
|
+ break
|
|
|
|
|
+ elif isinstance(raw_data, dict) and raw_data.get('type') == 'table':
|
|
|
|
|
+ table_data = raw_data
|
|
|
|
|
+ if table_data and 'table_cells' in table_data:
|
|
|
|
|
+ ocr_format = 'mineru'
|
|
|
|
|
+ else:
|
|
|
|
|
+ raise ValueError("未识别的 OCR 格式")
|
|
|
else:
|
|
else:
|
|
|
- raise ValueError("不是PPStructure格式的OCR结果")
|
|
|
|
|
|
|
+ raise ValueError("未识别的 OCR 格式(仅支持 PPStructure 或 MinerU)")
|
|
|
|
|
+
|
|
|
|
|
+ table_bbox, ocr_data = TableLineGenerator.parse_ocr_data(
|
|
|
|
|
+ raw_data,
|
|
|
|
|
+ tool=ocr_format
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- print(f" ✅ 加载OCR数据: {len(ocr_data)} 个文本框")
|
|
|
|
|
|
|
+ text_boxes = ocr_data.get('text_boxes', [])
|
|
|
|
|
+ print(f" ✅ 加载OCR数据: {len(text_boxes)} 个文本框")
|
|
|
|
|
+ print(f" 📋 OCR格式: {ocr_format}")
|
|
|
|
|
|
|
|
# 加载图片
|
|
# 加载图片
|
|
|
image = Image.open(image_file)
|
|
image = Image.open(image_file)
|
|
|
print(f" ✅ 加载图片: {image.size}")
|
|
print(f" ✅ 加载图片: {image.size}")
|
|
|
|
|
|
|
|
- # 🎯 应用模板
|
|
|
|
|
- img_with_lines = applier.apply_to_image(
|
|
|
|
|
- image,
|
|
|
|
|
- ocr_data,
|
|
|
|
|
- line_width=line_width,
|
|
|
|
|
- line_color=line_color
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # 🔑 验证混合模式的格式要求
|
|
|
|
|
+ if use_hybrid_mode and ocr_format != 'mineru':
|
|
|
|
|
+ print(f" ⚠️ 警告: 混合模式需要 MinerU 格式,当前格式为 {ocr_format}")
|
|
|
|
|
+ print(f" ℹ️ 自动切换到完全模板模式")
|
|
|
|
|
+ use_hybrid_mode = False
|
|
|
|
|
+
|
|
|
|
|
+ # 🆕 根据模式选择处理方式
|
|
|
|
|
+ if use_hybrid_mode:
|
|
|
|
|
+ print(f" 🔧 使用混合模式 (模板列 + MinerU 行)")
|
|
|
|
|
+ img_with_lines, structure = applier.apply_template_hybrid(
|
|
|
|
|
+ image,
|
|
|
|
|
+ ocr_data,
|
|
|
|
|
+ use_ocr_rows=True,
|
|
|
|
|
+ line_width=line_width,
|
|
|
|
|
+ line_color=line_color
|
|
|
|
|
+ )
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(f" 🔧 使用完全模板模式 (固定行高)")
|
|
|
|
|
+ img_with_lines, structure = applier.apply_template_fixed(
|
|
|
|
|
+ image,
|
|
|
|
|
+ text_boxes,
|
|
|
|
|
+ line_width=line_width,
|
|
|
|
|
+ line_color=line_color
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
# 保存图片
|
|
# 保存图片
|
|
|
- output_file = output_dir / f"{image_file.stem}_with_lines.png"
|
|
|
|
|
|
|
+ output_file = output_dir / f"{image_file.stem}.png"
|
|
|
img_with_lines.save(output_file)
|
|
img_with_lines.save(output_file)
|
|
|
|
|
|
|
|
- # 🆕 生成并保存结构配置
|
|
|
|
|
- structure = applier.generate_structure_for_image(ocr_data)
|
|
|
|
|
- structure_file = output_dir / f"{image_file.stem}_structure.json"
|
|
|
|
|
|
|
+ # 保存结构配置
|
|
|
|
|
+ structure_file = output_dir / f"{image_file.stem}{structure_suffix}"
|
|
|
with open(structure_file, 'w', encoding='utf-8') as f:
|
|
with open(structure_file, 'w', encoding='utf-8') as f:
|
|
|
json.dump(structure, f, indent=2, ensure_ascii=False)
|
|
json.dump(structure, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|
|
|
print(f" ✅ 保存图片: {output_file.name}")
|
|
print(f" ✅ 保存图片: {output_file.name}")
|
|
|
print(f" ✅ 保存配置: {structure_file.name}")
|
|
print(f" ✅ 保存配置: {structure_file.name}")
|
|
|
- print(f" 📊 表格: {structure['num_rows']}行 x {len(structure['columns'])}列")
|
|
|
|
|
|
|
+ print(f" 📊 表格: {len(structure['rows'])}行 x {len(structure['columns'])}列")
|
|
|
|
|
|
|
|
return True
|
|
return True
|
|
|
|
|
|
|
@@ -361,6 +555,8 @@ def apply_template_batch(
|
|
|
image_dir: str,
|
|
image_dir: str,
|
|
|
json_dir: str,
|
|
json_dir: str,
|
|
|
output_dir: str,
|
|
output_dir: str,
|
|
|
|
|
+ structure_suffix: str = "_structure.json",
|
|
|
|
|
+ use_hybrid_mode: bool = False,
|
|
|
line_width: int = 2,
|
|
line_width: int = 2,
|
|
|
line_color: Tuple[int, int, int] = (0, 0, 0)
|
|
line_color: Tuple[int, int, int] = (0, 0, 0)
|
|
|
):
|
|
):
|
|
@@ -414,7 +610,7 @@ def apply_template_batch(
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
if apply_template_to_single_file(
|
|
if apply_template_to_single_file(
|
|
|
- applier, image_file, json_file, output_path,
|
|
|
|
|
|
|
+ applier, image_file, json_file, output_path, structure_suffix, use_hybrid_mode,
|
|
|
line_width, line_color
|
|
line_width, line_color
|
|
|
):
|
|
):
|
|
|
results.append({
|
|
results.append({
|
|
@@ -454,29 +650,31 @@ def apply_template_batch(
|
|
|
def main():
|
|
def main():
|
|
|
"""主函数"""
|
|
"""主函数"""
|
|
|
parser = argparse.ArgumentParser(
|
|
parser = argparse.ArgumentParser(
|
|
|
- description='应用表格模板到其他页面',
|
|
|
|
|
|
|
+ description='应用表格模板到其他页面(支持混合模式)',
|
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
epilog="""
|
|
epilog="""
|
|
|
示例用法:
|
|
示例用法:
|
|
|
|
|
|
|
|
- 1. 批量处理整个目录:
|
|
|
|
|
|
|
+ 1. 混合模式(推荐,自适应行高):
|
|
|
python table_template_applier.py \\
|
|
python table_template_applier.py \\
|
|
|
- --template output/康强_北京农村商业银行_page_001_structure.json \\
|
|
|
|
|
|
|
+ --template template.json \\
|
|
|
--image-dir /path/to/images \\
|
|
--image-dir /path/to/images \\
|
|
|
--json-dir /path/to/jsons \\
|
|
--json-dir /path/to/jsons \\
|
|
|
- --output-dir /path/to/output
|
|
|
|
|
|
|
+ --output-dir /path/to/output \\
|
|
|
|
|
+ --structure-suffix _structure.json \\
|
|
|
|
|
+ --hybrid
|
|
|
|
|
|
|
|
- 2. 处理单个文件:
|
|
|
|
|
|
|
+ 2. 完全模板模式(固定行高):
|
|
|
python table_template_applier.py \\
|
|
python table_template_applier.py \\
|
|
|
- --template output/康强_北京农村商业银行_page_001_structure.json \\
|
|
|
|
|
- --image-file /path/to/page_002.png \\
|
|
|
|
|
- --json-file /path/to/page_002.json \\
|
|
|
|
|
- --output-dir /path/to/output
|
|
|
|
|
|
|
+ --template template.json \\
|
|
|
|
|
+ --image-file page.png \\
|
|
|
|
|
+ --json-file page.json \\
|
|
|
|
|
+ --output-dir /path/to/output \\
|
|
|
|
|
+ --structure-suffix _structure.json \\
|
|
|
|
|
|
|
|
-输出内容:
|
|
|
|
|
- - {name}_with_lines.png: 带表格线的图片
|
|
|
|
|
- - {name}_structure.json: 表格结构配置
|
|
|
|
|
- - batch_results.json: 批处理统计结果
|
|
|
|
|
|
|
+模式说明:
|
|
|
|
|
+ - 混合模式(--hybrid): 列宽使用模板,行高根据OCR自适应
|
|
|
|
|
+ - 完全模板模式: 列宽和行高都使用模板(适合固定格式表格)
|
|
|
"""
|
|
"""
|
|
|
)
|
|
)
|
|
|
|
|
|
|
@@ -522,6 +720,12 @@ def main():
|
|
|
required=True,
|
|
required=True,
|
|
|
help='输出目录(必需)'
|
|
help='输出目录(必需)'
|
|
|
)
|
|
)
|
|
|
|
|
+ output_group.add_argument(
|
|
|
|
|
+ '--structure-suffix',
|
|
|
|
|
+ type=str,
|
|
|
|
|
+ default='_structure.json',
|
|
|
|
|
+ help='输出结构配置文件后缀(默认: _structure.json)'
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
# 绘图参数组
|
|
# 绘图参数组
|
|
|
draw_group = parser.add_argument_group('绘图参数')
|
|
draw_group = parser.add_argument_group('绘图参数')
|
|
@@ -538,6 +742,14 @@ def main():
|
|
|
help='线条颜色(默认: black)'
|
|
help='线条颜色(默认: black)'
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+ # 🆕 新增模式参数
|
|
|
|
|
+ mode_group = parser.add_argument_group('模式参数')
|
|
|
|
|
+ mode_group.add_argument(
|
|
|
|
|
+ '--hybrid',
|
|
|
|
|
+ action='store_true',
|
|
|
|
|
+ help='使用混合模式(模板列 + OCR行,自适应行高,推荐)'
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
args = parser.parse_args()
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
# 颜色映射
|
|
# 颜色映射
|
|
@@ -581,7 +793,9 @@ def main():
|
|
|
|
|
|
|
|
success = apply_template_to_single_file(
|
|
success = apply_template_to_single_file(
|
|
|
applier, image_file, json_file, output_path,
|
|
applier, image_file, json_file, output_path,
|
|
|
- args.width, line_color
|
|
|
|
|
|
|
+ use_hybrid_mode=args.hybrid, # 🆕 传递混合模式参数
|
|
|
|
|
+ line_width=args.width,
|
|
|
|
|
+ line_color=line_color
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
if success:
|
|
if success:
|
|
@@ -610,8 +824,10 @@ def main():
|
|
|
str(image_dir),
|
|
str(image_dir),
|
|
|
str(json_dir),
|
|
str(json_dir),
|
|
|
str(output_path),
|
|
str(output_path),
|
|
|
- args.width,
|
|
|
|
|
- line_color
|
|
|
|
|
|
|
+ structure_suffix=args.structure_suffix,
|
|
|
|
|
+ use_hybrid_mode=args.hybrid, # 🆕 传递混合模式参数
|
|
|
|
|
+ line_width=args.width,
|
|
|
|
|
+ line_color=line_color,
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
else:
|
|
else:
|
|
@@ -633,14 +849,21 @@ if __name__ == "__main__":
|
|
|
|
|
|
|
|
# 默认配置
|
|
# 默认配置
|
|
|
default_config = {
|
|
default_config = {
|
|
|
- "template": "output/table_structures/康强_北京农村商业银行_page_001_structure.json",
|
|
|
|
|
|
|
+ "template": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行.wiredtable/康强_北京农村商业银行_page_001_structure.json",
|
|
|
"image-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/ppstructurev3_client_results/康强_北京农村商业银行/康强_北京农村商业银行_page_002.png",
|
|
"image-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/ppstructurev3_client_results/康强_北京农村商业银行/康强_北京农村商业银行_page_002.png",
|
|
|
"json-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/ppstructurev3_client_results/康强_北京农村商业银行_page_002.json",
|
|
"json-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/ppstructurev3_client_results/康强_北京农村商业银行_page_002.json",
|
|
|
"output-dir": "output/batch_results",
|
|
"output-dir": "output/batch_results",
|
|
|
"width": "2",
|
|
"width": "2",
|
|
|
"color": "black"
|
|
"color": "black"
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+ # default_config = {
|
|
|
|
|
+ # "template": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水.wiredtable/B用户_扫描流水_page_001_structure.json",
|
|
|
|
|
+ # "image-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水/B用户_扫描流水_page_002.png",
|
|
|
|
|
+ # "json-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results_cell_bbox/B用户_扫描流水_page_002.json",
|
|
|
|
|
+ # "output-dir": "output/batch_results",
|
|
|
|
|
+ # "width": "2",
|
|
|
|
|
+ # "color": "black"
|
|
|
|
|
+ # }
|
|
|
print("⚙️ 默认参数:")
|
|
print("⚙️ 默认参数:")
|
|
|
for key, value in default_config.items():
|
|
for key, value in default_config.items():
|
|
|
print(f" --{key}: {value}")
|
|
print(f" --{key}: {value}")
|
|
@@ -649,5 +872,7 @@ if __name__ == "__main__":
|
|
|
sys.argv = [sys.argv[0]]
|
|
sys.argv = [sys.argv[0]]
|
|
|
for key, value in default_config.items():
|
|
for key, value in default_config.items():
|
|
|
sys.argv.extend([f"--{key}", str(value)])
|
|
sys.argv.extend([f"--{key}", str(value)])
|
|
|
|
|
+
|
|
|
|
|
+ sys.argv.append("--hybrid") # 使用混合模式
|
|
|
|
|
|
|
|
sys.exit(main())
|
|
sys.exit(main())
|