# data_processor.py
"""
Data processing module.

Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox information.
"""
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import sys
  8. from pathlib import Path
  9. # 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
  10. ocr_platform_root = Path(__file__).parents[3] # ocr_merger -> ocr_tools -> ocr_platform -> repository.git
  11. if str(ocr_platform_root) not in sys.path:
  12. sys.path.insert(0, str(ocr_platform_root))
  13. try:
  14. from .text_matcher import TextMatcher
  15. from ocr_utils import BBoxExtractor # 从 ocr_utils 导入
  16. from .table_cell_matcher import TableCellMatcher
  17. except ImportError:
  18. from text_matcher import TextMatcher
  19. from ocr_utils import BBoxExtractor # 从 ocr_utils 导入
  20. from table_cell_matcher import TableCellMatcher
  21. class DataProcessor:
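    """Data processor.

    1. Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox
       information for table cells; every other block type keeps the bbox
       supplied by the VL model.
    2. Because each OCR tool has its own output format, there is one
       processing method per tool (MinerU, DotsOCR, PaddleOCR_VL). Data is
       first converted to the MinerU format and the table-cell bboxes are
       added afterwards.
    3. TextMatcher performs text matching and TableCellMatcher performs
       table-cell matching.
    4. The final output is always unified MinerU-format data.

    The MinerU and DotsOCR VL models report coordinates in the original
    image, not the rotated one; PaddleOCR_VL reports coordinates after the
    rotation transform; and the OCR text boxes used by PP-Structure are in
    rotated coordinates. Therefore, when processing VL data:

    1. The VL table coordinates are first converted to rotated coordinates
       using the PP-Structure rotation angle and the original image size.
    2. TableCellMatcher matches the table cells.
    3. The matched cell bboxes are converted back to original-image
       coordinates and stored in the final MinerU-format output.
    4. All other block types keep the bbox supplied by the VL model.
    """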
  35. def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10, x_tolerance: int = 3, y_tolerance: int = 10):
  36. """
  37. Args:
  38. text_matcher: 文本匹配器
  39. look_ahead_window: 向前查找窗口
  40. x_tolerance: x轴容差
  41. """
  42. self.text_matcher = text_matcher
  43. self.look_ahead_window = look_ahead_window
  44. # X轴容差, 用于判断文本框是否在同一列
  45. self.x_tolerance = x_tolerance
  46. self.y_tolerance = y_tolerance # Y轴容差, 用于行分组
  47. # 🎯 创建表格单元格匹配器
  48. self.table_cell_matcher = TableCellMatcher(
  49. text_matcher=text_matcher,
  50. x_tolerance=x_tolerance,
  51. y_tolerance=y_tolerance
  52. )
  53. def process_mineru_data(self, mineru_data: List[Dict],
  54. paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
  55. """
  56. 处理 MinerU 数据,添加 bbox 信息
  57. Args:
  58. mineru_data: MinerU 数据
  59. paddle_text_boxes: PaddleOCR 文字框列表
  60. Returns:
  61. 合并后的数据, table cell使用paddle的bbox,其他类型只是移动指针,bbox还是沿用minerU的bbox
  62. """
  63. merged_data = []
  64. paddle_pointer = 0
  65. last_matched_index = 0
  66. # 按 bbox 排序
  67. mineru_data.sort(
  68. key=lambda x: (x['bbox'][1], x['bbox'][0])
  69. if 'bbox' in x else (float('inf'), float('inf'))
  70. )
  71. for item in mineru_data:
  72. item_type = item.get('type', '')
  73. if item_type == 'table':
  74. if rotation_angle != 0:
  75. inverse_table_bbox = BBoxExtractor.rotate_box_coordinates(item['bbox'], rotation_angle, orig_image_size)
  76. inverse_item = item.copy()
  77. inverse_item['bbox'] = inverse_table_bbox
  78. else:
  79. inverse_item = item
  80. merged_item, paddle_pointer, skew_angle = self._process_table(
  81. inverse_item, paddle_text_boxes, paddle_pointer
  82. )
  83. # 🆕 保存角度信息到表格 item
  84. merged_item['image_rotation_angle'] = rotation_angle # 图片旋转角度
  85. merged_item['skew_angle'] = skew_angle # 倾斜角度
  86. # 如果有旋转,需要将匹配到的单元格bbox逆向转换为原图坐标
  87. if rotation_angle != 0:
  88. for cell in merged_item.get('table_cells', []):
  89. cell_bbox = cell.get('bbox', [])
  90. if cell_bbox:
  91. original_bbox = BBoxExtractor.inverse_rotate_box_coordinates(cell_bbox, rotation_angle, orig_image_size)
  92. cell['bbox'] = original_bbox
  93. merged_item['bbox'] = item['bbox'] # 保持表格的原始bbox不变
  94. merged_data.append(merged_item)
  95. elif item_type in ['text', 'title', 'header', 'footer']:
  96. merged_item, paddle_pointer, last_matched_index = self._process_text(
  97. item, paddle_text_boxes, paddle_pointer, last_matched_index
  98. )
  99. merged_data.append(merged_item)
  100. elif item_type == 'list':
  101. merged_item, paddle_pointer, last_matched_index = self._process_list(
  102. item, paddle_text_boxes, paddle_pointer, last_matched_index
  103. )
  104. merged_data.append(merged_item)
  105. else:
  106. merged_data.append(item.copy())
  107. return merged_data
  108. def process_dotsocr_data(self, dotsocr_data: List[Dict],
  109. paddle_text_boxes: List[Dict],
  110. rotation_angle: float,
  111. orig_image_size: Tuple[int, int]) -> List[Dict]:
  112. """
  113. 处理 DotsOCR 数据(简化版:转换后复用 MinerU 处理逻辑)
  114. Args:
  115. dotsocr_data: DotsOCR 输出数据
  116. paddle_text_boxes: PaddleOCR 文本框
  117. rotation_angle: 旋转角度
  118. orig_image_size: 原始图片尺寸
  119. Returns:
  120. 统一的 MinerU 格式数据(带 table_cells bbox)
  121. """
  122. print(f"📊 处理 DotsOCR 数据: {len(dotsocr_data)} 个块")
  123. # 🎯 第一步:转换为 MinerU 格式
  124. mineru_format_data = []
  125. for item in dotsocr_data:
  126. try:
  127. converted_item = self._convert_dotsocr_to_mineru(item)
  128. if converted_item:
  129. mineru_format_data.append(converted_item)
  130. except Exception as e:
  131. print(f"⚠️ DotsOCR 转换失败: {e}")
  132. continue
  133. print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")
  134. # 🎯 第二步:复用 MinerU 处理逻辑
  135. return self.process_mineru_data(
  136. mineru_data=mineru_format_data,
  137. paddle_text_boxes=paddle_text_boxes,
  138. rotation_angle=rotation_angle,
  139. orig_image_size=orig_image_size
  140. )
  141. def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
  142. """
  143. 🎯 将 DotsOCR 格式转换为 MinerU 格式
  144. DotsOCR:
  145. {
  146. "category": "Table",
  147. "bbox": [x1, y1, x2, y2],
  148. "text": "..."
  149. }
  150. MinerU:
  151. {
  152. "type": "table",
  153. "bbox": [x1, y1, x2, y2],
  154. "table_body": "...",
  155. "page_idx": 0
  156. }
  157. """
  158. category = dotsocr_item.get('category', '')
  159. # 🎯 Category 映射
  160. category_map = {
  161. 'Page-header': 'header',
  162. 'Page-footer': 'footer',
  163. 'Picture': 'image',
  164. 'Figure': 'image',
  165. 'Section-header': 'title',
  166. 'Table': 'table',
  167. 'Text': 'text',
  168. 'Title': 'title',
  169. 'List': 'list',
  170. 'Caption': 'title'
  171. }
  172. mineru_type = category_map.get(category, 'text')
  173. # 🎯 基础转换
  174. mineru_item = {
  175. 'type': mineru_type,
  176. 'bbox': dotsocr_item.get('bbox', []),
  177. 'page_idx': 0 # DotsOCR 默认单页
  178. }
  179. # 🎯 处理文本内容
  180. text = dotsocr_item.get('text', '')
  181. if mineru_type == 'table':
  182. # 表格:text -> table_body
  183. mineru_item['table_body'] = text
  184. else:
  185. # 其他类型:保持 text
  186. mineru_item['text'] = text
  187. # 标题级别
  188. if category == 'Section-header':
  189. mineru_item['text_level'] = 1
  190. return mineru_item
  191. def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
  192. paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
  193. """
  194. 处理 PaddleOCR_VL 数据,添加 bbox 信息
  195. Args:
  196. paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
  197. paddle_text_boxes: PaddleOCR 文字框列表
  198. Returns:
  199. 🎯 MinerU 格式的合并数据(统一输出格式)
  200. """
  201. # 🎯 获取旋转角度和原始图像尺寸
  202. vl_rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
  203. vl_orig_image_size = (0,0)
  204. if vl_rotation_angle != 0:
  205. vl_orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
  206. print(f"🔄 PaddleOCR_VL 检测到旋转角度: {vl_rotation_angle}°")
  207. print(f"📐 原始图像尺寸: {vl_orig_image_size[0]} x {vl_orig_image_size[1]}")
  208. # 提取 parsing_res_list
  209. parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
  210. # 按 bbox 排序
  211. parsing_res_list.sort(
  212. key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
  213. if 'block_bbox' in x else (float('inf'), float('inf'))
  214. )
  215. mineru_format_data = []
  216. for item in parsing_res_list:
  217. # 🎯 先转换 bbox 坐标(如果需要)
  218. if vl_rotation_angle != 0 and orig_image_size:
  219. item = self._transform_vl_block_bbox(item, vl_rotation_angle, vl_orig_image_size)
  220. converted_item = self._convert_paddleocr_vl_to_mineru(item)
  221. if converted_item:
  222. mineru_format_data.append(converted_item)
  223. print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")
  224. # 🎯 第三步:复用 MinerU 处理逻辑
  225. return self.process_mineru_data(
  226. mineru_data=mineru_format_data,
  227. paddle_text_boxes=paddle_text_boxes,
  228. rotation_angle=rotation_angle,
  229. orig_image_size=orig_image_size
  230. )
  231. def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
  232. """从 PaddleOCR_VL 数据中获取旋转角度"""
  233. return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)
  234. def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
  235. """从 PaddleOCR_VL 数据中获取原始图像尺寸"""
  236. return BBoxExtractor._get_original_image_size(paddleocr_vl_data)
  237. def _transform_vl_block_bbox(self, item: Dict, angle: float,
  238. orig_image_size: tuple) -> Dict:
  239. """
  240. 转换 PaddleOCR_VL 的 block_bbox 坐标
  241. Args:
  242. item: PaddleOCR_VL 的 block 数据
  243. angle: 旋转角度
  244. orig_image_size: 原始图像尺寸
  245. Returns:
  246. 转换后的 block 数据
  247. """
  248. transformed_item = item.copy()
  249. if 'block_bbox' not in item:
  250. return transformed_item
  251. block_bbox = item['block_bbox']
  252. if len(block_bbox) < 4:
  253. return transformed_item
  254. transformed_bbox = BBoxExtractor.inverse_rotate_box_coordinates(block_bbox, angle, orig_image_size)
  255. transformed_item['block_bbox'] = transformed_bbox
  256. return transformed_item
  257. def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
  258. """
  259. 🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
  260. 基于 PP-DocLayout_plus-L 的 20 种类别
  261. """
  262. block_label = paddleocr_vl_item.get('block_label', '')
  263. # 🎯 PP-DocLayout_plus-L 类别映射(共 20 种)
  264. label_map = {
  265. # 标题类(3种)
  266. 'paragraph_title': 'title',
  267. 'doc_title': 'title',
  268. 'figure_table_chart_title': 'title',
  269. # 文本类(9种)
  270. 'text': 'text',
  271. 'number': 'text',
  272. 'content': 'text',
  273. 'abstract': 'text',
  274. 'footnote': 'text',
  275. 'aside_text': 'text',
  276. 'algorithm': 'text',
  277. 'reference': 'text',
  278. 'reference_content': 'text',
  279. # 页眉页脚(2种)
  280. 'header': 'header',
  281. 'footer': 'footer',
  282. # 表格(1种)
  283. 'table': 'table',
  284. # 图片/图表(3种)
  285. 'image': 'image',
  286. 'chart': 'image',
  287. 'seal': 'image',
  288. # 公式(2种)
  289. 'formula': 'equation',
  290. 'formula_number': 'equation'
  291. }
  292. mineru_type = label_map.get(block_label, 'text')
  293. mineru_item = {
  294. 'type': mineru_type,
  295. 'bbox': paddleocr_vl_item.get('block_bbox', []),
  296. 'page_idx': 0
  297. }
  298. content = paddleocr_vl_item.get('block_content', '')
  299. if mineru_type == 'table':
  300. mineru_item['table_body'] = content
  301. else:
  302. mineru_item['text'] = content
  303. # 标题级别
  304. if block_label == 'doc_title':
  305. mineru_item['text_level'] = 1
  306. elif block_label == 'paragraph_title':
  307. mineru_item['text_level'] = 2
  308. elif block_label == 'figure_table_chart_title':
  309. mineru_item['text_level'] = 3
  310. return mineru_item
  311. def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
  312. start_pointer: int) -> Tuple[Dict, int, float]:
  313. """
  314. 处理表格类型(MinerU 格式)
  315. 策略:
  316. - 解析 HTML 表格
  317. - 为每个单元格匹配 PaddleOCR 的 bbox
  318. - 返回处理后的表格、新指针位置和倾斜角度
  319. """
  320. skew_angle = 0.0
  321. table_body = item.get('table_body', '')
  322. if not table_body:
  323. print(f"⚠️ 表格内容为空,跳过")
  324. return item, start_pointer, skew_angle
  325. try:
  326. # 🔑 传入 table_bbox 用于筛选
  327. table_bbox = item.get('bbox') # MinerU 提供的表格边界
  328. # 🎯 委托给 TableCellMatcher
  329. enhanced_html, cells, new_pointer, skew_angle = \
  330. self.table_cell_matcher.enhance_table_html_with_bbox(
  331. table_body,
  332. paddle_text_boxes,
  333. start_pointer,
  334. table_bbox
  335. )
  336. # 更新 item
  337. item['table_body'] = enhanced_html
  338. item['table_cells'] = cells
  339. # 统计信息
  340. matched_count = len(cells)
  341. total_cells = len(BeautifulSoup(table_body, 'html.parser').find_all(['td', 'th']))
  342. print(f" 表格单元格: {matched_count}/{total_cells} 匹配")
  343. return item, new_pointer, skew_angle # 🆕 返回倾斜角度
  344. except Exception as e:
  345. print(f"⚠️ 表格处理失败: {e}")
  346. import traceback
  347. traceback.print_exc()
  348. return item, start_pointer, skew_angle
  349. def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
  350. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  351. """处理文本"""
  352. merged_item = item.copy()
  353. text = item.get('text', '')
  354. matched_bbox, paddle_pointer, last_matched_index = \
  355. self.text_matcher.find_matching_bbox(
  356. text, paddle_text_boxes, paddle_pointer, last_matched_index,
  357. self.look_ahead_window
  358. )
  359. if matched_bbox:
  360. matched_bbox['used'] = True
  361. return merged_item, paddle_pointer, last_matched_index
  362. def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
  363. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  364. """处理列表"""
  365. merged_item = item.copy()
  366. list_items = item.get('list_items', [])
  367. for list_item in list_items:
  368. matched_bbox, paddle_pointer, last_matched_index = \
  369. self.text_matcher.find_matching_bbox(
  370. list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
  371. self.look_ahead_window
  372. )
  373. if matched_bbox:
  374. matched_bbox['used'] = True
  375. return merged_item, paddle_pointer, last_matched_index