|
|
@@ -7,8 +7,10 @@ from bs4 import BeautifulSoup
|
|
|
|
|
|
try:
|
|
|
from .text_matcher import TextMatcher
|
|
|
+ from .bbox_extractor import BBoxExtractor
|
|
|
except ImportError:
|
|
|
from text_matcher import TextMatcher
|
|
|
+ from bbox_extractor import BBoxExtractor
|
|
|
|
|
|
|
|
|
class DataProcessor:
|
|
|
@@ -212,28 +214,41 @@ class DataProcessor:
|
|
|
paddle_text_boxes: List[Dict]) -> List[Dict]:
|
|
|
"""
|
|
|
处理 PaddleOCR_VL 数据,添加 bbox 信息
|
|
|
-
|
|
|
+
|
|
|
Args:
|
|
|
paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
|
|
|
paddle_text_boxes: PaddleOCR 文字框列表
|
|
|
-
|
|
|
+
|
|
|
Returns:
|
|
|
- MinerU 格式的合并数据(统一输出格式)
|
|
|
+ 🎯 MinerU 格式的合并数据(统一输出格式)
|
|
|
"""
|
|
|
merged_data = []
|
|
|
paddle_pointer = 0
|
|
|
last_matched_index = 0
|
|
|
-
|
|
|
+
|
|
|
+ # 🎯 获取旋转角度和原始图像尺寸
|
|
|
+ rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
|
|
|
+ orig_image_size = None
|
|
|
+
|
|
|
+ if rotation_angle != 0:
|
|
|
+ orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
|
|
|
+ print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
|
|
|
+ print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
|
|
|
+
|
|
|
# 提取 parsing_res_list
|
|
|
parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
|
|
|
-
|
|
|
+
|
|
|
# 按 bbox 排序
|
|
|
parsing_res_list.sort(
|
|
|
key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
|
|
|
if 'block_bbox' in x else (float('inf'), float('inf'))
|
|
|
)
|
|
|
-
|
|
|
+
|
|
|
for item in parsing_res_list:
|
|
|
+ # 🎯 先转换 bbox 坐标(如果需要)
|
|
|
+ if rotation_angle != 0 and orig_image_size:
|
|
|
+ item = self._transform_vl_block_bbox(item, rotation_angle, orig_image_size)
|
|
|
+
|
|
|
# 🎯 统一转换为 MinerU 格式
|
|
|
mineru_item = self._convert_paddleocr_vl_to_mineru(item)
|
|
|
item_type = mineru_item.get('type', '')
|
|
|
@@ -244,56 +259,94 @@ class DataProcessor:
|
|
|
mineru_item, paddle_text_boxes, paddle_pointer
|
|
|
)
|
|
|
merged_data.append(merged_item)
|
|
|
-
|
|
|
+
|
|
|
elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
|
|
|
merged_item, paddle_pointer, last_matched_index = self._process_text(
|
|
|
mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
)
|
|
|
merged_data.append(merged_item)
|
|
|
-
|
|
|
+
|
|
|
elif item_type == 'list':
|
|
|
merged_item, paddle_pointer, last_matched_index = self._process_list(
|
|
|
mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
)
|
|
|
merged_data.append(merged_item)
|
|
|
-
|
|
|
+
|
|
|
else:
|
|
|
- # 其他类型(image, equation 等)直接添加
|
|
|
+ # 其他类型(image 等)直接添加
|
|
|
merged_data.append(mineru_item)
|
|
|
-
|
|
|
+
|
|
|
return merged_data
|
|
|
|
|
|
def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
    """Return the rotation angle stored in a PaddleOCR_VL result.

    Thin wrapper that forwards the whole result object to
    ``BBoxExtractor._get_rotation_angle`` and hands back its answer
    untouched.

    Args:
        paddleocr_vl_data: PaddleOCR_VL result (JSON object).

    Returns:
        Whatever ``BBoxExtractor._get_rotation_angle`` reports for this
        result.
    """
    rotation_angle = BBoxExtractor._get_rotation_angle(paddleocr_vl_data)
    return rotation_angle
|
|
|
+
|
|
|
def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
    """Return the original image size stored in a PaddleOCR_VL result.

    Thin wrapper that forwards the whole result object to
    ``BBoxExtractor._get_original_image_size`` and hands back its answer
    untouched.

    Args:
        paddleocr_vl_data: PaddleOCR_VL result (JSON object).

    Returns:
        Whatever ``BBoxExtractor._get_original_image_size`` reports for
        this result.
    """
    original_size = BBoxExtractor._get_original_image_size(paddleocr_vl_data)
    return original_size
|
|
|
+
|
|
|
+ def _transform_vl_block_bbox(self, item: Dict, angle: float,
|
|
|
+ orig_image_size: tuple) -> Dict:
|
|
|
+ """
|
|
|
+ 转换 PaddleOCR_VL 的 block_bbox 坐标
|
|
|
+
|
|
|
+ Args:
|
|
|
+ item: PaddleOCR_VL 的 block 数据
|
|
|
+ angle: 旋转角度
|
|
|
+ orig_image_size: 原始图像尺寸
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 转换后的 block 数据
|
|
|
+ """
|
|
|
+ transformed_item = item.copy()
|
|
|
+
|
|
|
+ if 'block_bbox' not in item:
|
|
|
+ return transformed_item
|
|
|
+
|
|
|
+ block_bbox = item['block_bbox']
|
|
|
+ if len(block_bbox) < 4:
|
|
|
+ return transformed_item
|
|
|
+
|
|
|
+ # block_bbox 格式: [x1, y1, x2, y2]
|
|
|
+ # 转换为 poly 格式进行旋转
|
|
|
+ poly = [
|
|
|
+ [block_bbox[0], block_bbox[1]], # 左上
|
|
|
+ [block_bbox[2], block_bbox[1]], # 右上
|
|
|
+ [block_bbox[2], block_bbox[3]], # 右下
|
|
|
+ [block_bbox[0], block_bbox[3]] # 左下
|
|
|
+ ]
|
|
|
+
|
|
|
+ # 🎯 使用 BBoxExtractor 的坐标转换方法
|
|
|
+ transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
|
|
|
+ poly, angle, orig_image_size
|
|
|
+ )
|
|
|
+
|
|
|
+ # 转换回 bbox 格式
|
|
|
+ xs = [p[0] for p in transformed_poly]
|
|
|
+ ys = [p[1] for p in transformed_poly]
|
|
|
+ transformed_bbox = [min(xs), min(ys), max(xs), max(ys)]
|
|
|
+
|
|
|
+ transformed_item['block_bbox'] = transformed_bbox
|
|
|
+
|
|
|
+ return transformed_item
|
|
|
+
|
|
|
def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
|
|
|
"""
|
|
|
🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
|
|
|
|
|
|
- PaddleOCR_VL (PP-DocLayout_plus-L):
|
|
|
- {
|
|
|
- "block_label": "paragraph_title", # 或 "doc_title", "text" 等
|
|
|
- "block_bbox": [172, 151, 547, 184],
|
|
|
- "block_content": "...",
|
|
|
- "block_id": 0
|
|
|
- }
|
|
|
-
|
|
|
- MinerU:
|
|
|
- {
|
|
|
- "type": "title",
|
|
|
- "bbox": [172, 151, 547, 184],
|
|
|
- "text": "...",
|
|
|
- "text_level": 1,
|
|
|
- "page_idx": 0
|
|
|
- }
|
|
|
+ 基于 PP-DocLayout_plus-L 的 20 种类别
|
|
|
"""
|
|
|
block_label = paddleocr_vl_item.get('block_label', '')
|
|
|
|
|
|
- # 🎯 PP-DocLayout_plus-L 类别映射
|
|
|
+ # 🎯 PP-DocLayout_plus-L 类别映射(共 20 种)
|
|
|
label_map = {
|
|
|
- # 标题类
|
|
|
- 'paragraph_title': 'title', # 段落标题 → title (level 2)
|
|
|
- 'doc_title': 'title', # 文档标题 → title (level 1)
|
|
|
- 'figure_table_chart_title': 'title', # 图表标题 → title (level 3)
|
|
|
+ # 标题类(3种)
|
|
|
+ 'paragraph_title': 'title',
|
|
|
+ 'doc_title': 'title',
|
|
|
+ 'figure_table_chart_title': 'title',
|
|
|
|
|
|
- # 文本类
|
|
|
+ # 文本类(9种)
|
|
|
'text': 'text',
|
|
|
'number': 'text',
|
|
|
'content': 'text',
|
|
|
@@ -301,57 +354,49 @@ class DataProcessor:
|
|
|
'footnote': 'text',
|
|
|
'aside_text': 'text',
|
|
|
'algorithm': 'text',
|
|
|
-
|
|
|
- # 参考文献
|
|
|
'reference': 'text',
|
|
|
'reference_content': 'text',
|
|
|
|
|
|
- # 页眉页脚
|
|
|
+ # 页眉页脚(2种)
|
|
|
'header': 'header',
|
|
|
'footer': 'footer',
|
|
|
|
|
|
- # 表格
|
|
|
+ # 表格(1种)
|
|
|
'table': 'table',
|
|
|
|
|
|
- # 图片
|
|
|
+ # 图片/图表(3种)
|
|
|
'image': 'image',
|
|
|
'chart': 'image',
|
|
|
+ 'seal': 'image',
|
|
|
|
|
|
- # 公式
|
|
|
+ # 公式(2种)
|
|
|
'formula': 'equation',
|
|
|
- 'formula_number': 'equation',
|
|
|
-
|
|
|
- # 印章
|
|
|
- 'seal': 'image'
|
|
|
+ 'formula_number': 'equation'
|
|
|
}
|
|
|
|
|
|
mineru_type = label_map.get(block_label, 'text')
|
|
|
|
|
|
- # 🎯 基础转换
|
|
|
mineru_item = {
|
|
|
'type': mineru_type,
|
|
|
'bbox': paddleocr_vl_item.get('block_bbox', []),
|
|
|
'page_idx': 0
|
|
|
}
|
|
|
|
|
|
- # 🎯 处理文本内容
|
|
|
content = paddleocr_vl_item.get('block_content', '')
|
|
|
|
|
|
if mineru_type == 'table':
|
|
|
- # 表格:block_content -> table_body
|
|
|
mineru_item['table_body'] = content
|
|
|
else:
|
|
|
- # 其他类型:block_content -> text
|
|
|
mineru_item['text'] = content
|
|
|
|
|
|
- # 🎯 处理标题级别(基于实际的类别)
|
|
|
+ # 标题级别
|
|
|
if block_label == 'doc_title':
|
|
|
- mineru_item['text_level'] = 1 # 文档标题 - 一级
|
|
|
+ mineru_item['text_level'] = 1
|
|
|
elif block_label == 'paragraph_title':
|
|
|
- mineru_item['text_level'] = 2 # 段落标题 - 二级
|
|
|
+ mineru_item['text_level'] = 2
|
|
|
elif block_label == 'figure_table_chart_title':
|
|
|
- mineru_item['text_level'] = 3 # 图表标题 - 三级
|
|
|
-
|
|
|
+ mineru_item['text_level'] = 3
|
|
|
+
|
|
|
return mineru_item
|
|
|
|
|
|
def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
|