data_processor.py 16 KB


"""
Data processing module.

Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox information.
"""
from typing import Callable, Dict, List, Tuple

from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
except ImportError:
    from text_matcher import TextMatcher
  11. class DataProcessor:
  12. """数据处理器"""
  13. def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
  14. """
  15. Args:
  16. text_matcher: 文本匹配器
  17. look_ahead_window: 向前查找窗口
  18. """
  19. self.text_matcher = text_matcher
  20. self.look_ahead_window = look_ahead_window
  21. def process_mineru_data(self, mineru_data: List[Dict],
  22. paddle_text_boxes: List[Dict]) -> List[Dict]:
  23. """
  24. 处理 MinerU 数据,添加 bbox 信息
  25. Args:
  26. mineru_data: MinerU 数据
  27. paddle_text_boxes: PaddleOCR 文字框列表
  28. Returns:
  29. 合并后的数据, table cell使用paddle的bbox,其他类型只是移动指针,bbox还是沿用minerU的bbox
  30. """
  31. merged_data = []
  32. paddle_pointer = 0
  33. last_matched_index = 0
  34. # 按 bbox 排序
  35. mineru_data.sort(
  36. key=lambda x: (x['bbox'][1], x['bbox'][0])
  37. if 'bbox' in x else (float('inf'), float('inf'))
  38. )
  39. for item in mineru_data:
  40. item_type = item.get('type', '')
  41. if item_type == 'table':
  42. merged_item, paddle_pointer = self._process_table(
  43. item, paddle_text_boxes, paddle_pointer
  44. )
  45. merged_data.append(merged_item)
  46. elif item_type in ['text', 'title']:
  47. merged_item, paddle_pointer, last_matched_index = self._process_text(
  48. item, paddle_text_boxes, paddle_pointer, last_matched_index
  49. )
  50. merged_data.append(merged_item)
  51. elif item_type == 'list':
  52. merged_item, paddle_pointer, last_matched_index = self._process_list(
  53. item, paddle_text_boxes, paddle_pointer, last_matched_index
  54. )
  55. merged_data.append(merged_item)
  56. else:
  57. merged_data.append(item.copy())
  58. return merged_data
  59. def process_dotsocr_data(self, dotsocr_data: List[Dict],
  60. paddle_text_boxes: List[Dict]) -> List[Dict]:
  61. """
  62. 🎯 处理 DotsOCR 数据,转换为 MinerU 格式并添加 bbox 信息
  63. Args:
  64. dotsocr_data: DotsOCR 数据
  65. paddle_text_boxes: PaddleOCR 文字框列表
  66. Returns:
  67. MinerU 格式的合并数据
  68. """
  69. merged_data = []
  70. paddle_pointer = 0
  71. last_matched_index = 0
  72. # 按 bbox 排序
  73. dotsocr_data.sort(
  74. key=lambda x: (x['bbox'][1], x['bbox'][0])
  75. if 'bbox' in x else (float('inf'), float('inf'))
  76. )
  77. for item in dotsocr_data:
  78. # 🎯 转换为 MinerU 格式
  79. mineru_item = self._convert_dotsocr_to_mineru(item)
  80. category = mineru_item.get('type', '')
  81. # 🎯 根据类型处理
  82. if category.lower() == 'table':
  83. merged_item, paddle_pointer = self._process_dotsocr_table(
  84. mineru_item, paddle_text_boxes, paddle_pointer
  85. )
  86. merged_data.append(merged_item)
  87. elif category.lower() in ['text', 'title', 'header', 'footer']:
  88. merged_item, paddle_pointer, last_matched_index = self._process_text(
  89. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  90. )
  91. merged_data.append(merged_item)
  92. elif category.lower() == 'list':
  93. merged_item, paddle_pointer, last_matched_index = self._process_list(
  94. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  95. )
  96. merged_data.append(merged_item)
  97. else:
  98. # Page-header, Page-footer, Picture 等
  99. merged_data.append(mineru_item)
  100. return merged_data
  101. def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
  102. """
  103. 🎯 将 DotsOCR 格式转换为 MinerU 格式
  104. DotsOCR:
  105. {
  106. "category": "Table",
  107. "bbox": [x1, y1, x2, y2],
  108. "text": "..."
  109. }
  110. MinerU:
  111. {
  112. "type": "table",
  113. "bbox": [x1, y1, x2, y2],
  114. "table_body": "...",
  115. "page_idx": 0
  116. }
  117. """
  118. category = dotsocr_item.get('category', '')
  119. # 🎯 Category 映射
  120. category_map = {
  121. 'Page-header': 'header',
  122. 'Page-footer': 'footer',
  123. 'Picture': 'image',
  124. 'Figure': 'image',
  125. 'Section-header': 'title',
  126. 'Table': 'table',
  127. 'Text': 'text',
  128. 'Title': 'title',
  129. 'List': 'list',
  130. 'Caption': 'title'
  131. }
  132. mineru_type = category_map.get(category, 'text')
  133. # 🎯 基础转换
  134. mineru_item = {
  135. 'type': mineru_type,
  136. 'bbox': dotsocr_item.get('bbox', []),
  137. 'page_idx': 0 # DotsOCR 默认单页
  138. }
  139. # 🎯 处理文本内容
  140. text = dotsocr_item.get('text', '')
  141. if mineru_type == 'table':
  142. # 表格:text -> table_body
  143. mineru_item['table_body'] = text
  144. else:
  145. # 其他类型:保持 text
  146. mineru_item['text'] = text
  147. # 标题级别
  148. if category == 'Section-header':
  149. mineru_item['text_level'] = 1
  150. return mineru_item
  151. def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
  152. start_pointer: int) -> Tuple[Dict, int]:
  153. """
  154. 🎯 处理 DotsOCR 表格(已转换为 MinerU 格式)
  155. DotsOCR 的表格 HTML 已经在 text 字段中,需要转移到 table_body
  156. """
  157. merged_item = item.copy()
  158. table_html = item.get('table_body', '')
  159. if not table_html:
  160. return merged_item, start_pointer
  161. # 🎯 复用表格处理逻辑
  162. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  163. table_html, paddle_text_boxes, start_pointer
  164. )
  165. merged_item['table_body'] = enhanced_html
  166. merged_item['table_body_with_bbox'] = enhanced_html
  167. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  168. merged_item['table_cells'] = cells if cells else []
  169. return merged_item, new_pointer
  170. def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
  171. paddle_text_boxes: List[Dict]) -> List[Dict]:
  172. """
  173. 处理 PaddleOCR_VL 数据,添加 bbox 信息
  174. Args:
  175. paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
  176. paddle_text_boxes: PaddleOCR 文字框列表
  177. Returns:
  178. MinerU 格式的合并数据(统一输出格式)
  179. """
  180. merged_data = []
  181. paddle_pointer = 0
  182. last_matched_index = 0
  183. # 提取 parsing_res_list
  184. parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
  185. # 按 bbox 排序
  186. parsing_res_list.sort(
  187. key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
  188. if 'block_bbox' in x else (float('inf'), float('inf'))
  189. )
  190. for item in parsing_res_list:
  191. # 🎯 统一转换为 MinerU 格式
  192. mineru_item = self._convert_paddleocr_vl_to_mineru(item)
  193. item_type = mineru_item.get('type', '')
  194. # 🎯 根据类型处理(复用 MinerU 的通用方法)
  195. if item_type == 'table':
  196. merged_item, paddle_pointer = self._process_table(
  197. mineru_item, paddle_text_boxes, paddle_pointer
  198. )
  199. merged_data.append(merged_item)
  200. elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
  201. merged_item, paddle_pointer, last_matched_index = self._process_text(
  202. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  203. )
  204. merged_data.append(merged_item)
  205. elif item_type == 'list':
  206. merged_item, paddle_pointer, last_matched_index = self._process_list(
  207. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  208. )
  209. merged_data.append(merged_item)
  210. else:
  211. # 其他类型(image, equation 等)直接添加
  212. merged_data.append(mineru_item)
  213. return merged_data
  214. def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
  215. """
  216. 🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
  217. PaddleOCR_VL (PP-DocLayout_plus-L):
  218. {
  219. "block_label": "paragraph_title", # 或 "doc_title", "text" 等
  220. "block_bbox": [172, 151, 547, 184],
  221. "block_content": "...",
  222. "block_id": 0
  223. }
  224. MinerU:
  225. {
  226. "type": "title",
  227. "bbox": [172, 151, 547, 184],
  228. "text": "...",
  229. "text_level": 1,
  230. "page_idx": 0
  231. }
  232. """
  233. block_label = paddleocr_vl_item.get('block_label', '')
  234. # 🎯 PP-DocLayout_plus-L 类别映射
  235. label_map = {
  236. # 标题类
  237. 'paragraph_title': 'title', # 段落标题 → title (level 2)
  238. 'doc_title': 'title', # 文档标题 → title (level 1)
  239. 'figure_table_chart_title': 'title', # 图表标题 → title (level 3)
  240. # 文本类
  241. 'text': 'text',
  242. 'number': 'text',
  243. 'content': 'text',
  244. 'abstract': 'text',
  245. 'footnote': 'text',
  246. 'aside_text': 'text',
  247. 'algorithm': 'text',
  248. # 参考文献
  249. 'reference': 'text',
  250. 'reference_content': 'text',
  251. # 页眉页脚
  252. 'header': 'header',
  253. 'footer': 'footer',
  254. # 表格
  255. 'table': 'table',
  256. # 图片
  257. 'image': 'image',
  258. 'chart': 'image',
  259. # 公式
  260. 'formula': 'equation',
  261. 'formula_number': 'equation',
  262. # 印章
  263. 'seal': 'image'
  264. }
  265. mineru_type = label_map.get(block_label, 'text')
  266. # 🎯 基础转换
  267. mineru_item = {
  268. 'type': mineru_type,
  269. 'bbox': paddleocr_vl_item.get('block_bbox', []),
  270. 'page_idx': 0
  271. }
  272. # 🎯 处理文本内容
  273. content = paddleocr_vl_item.get('block_content', '')
  274. if mineru_type == 'table':
  275. # 表格:block_content -> table_body
  276. mineru_item['table_body'] = content
  277. else:
  278. # 其他类型:block_content -> text
  279. mineru_item['text'] = content
  280. # 🎯 处理标题级别(基于实际的类别)
  281. if block_label == 'doc_title':
  282. mineru_item['text_level'] = 1 # 文档标题 - 一级
  283. elif block_label == 'paragraph_title':
  284. mineru_item['text_level'] = 2 # 段落标题 - 二级
  285. elif block_label == 'figure_table_chart_title':
  286. mineru_item['text_level'] = 3 # 图表标题 - 三级
  287. return mineru_item
  288. def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
  289. start_pointer: int) -> Tuple[Dict, int]:
  290. """处理 MinerU 表格"""
  291. merged_item = item.copy()
  292. table_html = item.get('table_body', '')
  293. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  294. table_html, paddle_text_boxes, start_pointer
  295. )
  296. merged_item['table_body'] = enhanced_html
  297. merged_item['table_body_with_bbox'] = enhanced_html
  298. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  299. merged_item['table_cells'] = cells if cells else []
  300. return merged_item, new_pointer
  301. def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
  302. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  303. """处理文本"""
  304. merged_item = item.copy()
  305. text = item.get('text', '')
  306. matched_bbox, paddle_pointer, last_matched_index = \
  307. self.text_matcher.find_matching_bbox(
  308. text, paddle_text_boxes, paddle_pointer, last_matched_index,
  309. self.look_ahead_window
  310. )
  311. if matched_bbox:
  312. matched_bbox['used'] = True
  313. return merged_item, paddle_pointer, last_matched_index
  314. def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
  315. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  316. """处理列表"""
  317. merged_item = item.copy()
  318. list_items = item.get('list_items', [])
  319. for list_item in list_items:
  320. matched_bbox, paddle_pointer, last_matched_index = \
  321. self.text_matcher.find_matching_bbox(
  322. list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
  323. self.look_ahead_window
  324. )
  325. if matched_bbox:
  326. matched_bbox['used'] = True
  327. return merged_item, paddle_pointer, last_matched_index
  328. def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  329. start_pointer: int) -> Tuple[str, List[Dict], int]:
  330. """为 HTML 表格添加 bbox 信息"""
  331. soup = BeautifulSoup(html, 'html.parser')
  332. current_pointer = start_pointer
  333. last_matched_index = start_pointer
  334. cells = []
  335. for row_idx, row in enumerate(soup.find_all('tr')):
  336. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  337. cell_text = cell.get_text(strip=True)
  338. if not cell_text:
  339. continue
  340. matched_bbox, current_pointer, last_matched_index = \
  341. self.text_matcher.find_matching_bbox(
  342. cell_text, paddle_text_boxes, current_pointer,
  343. last_matched_index, self.look_ahead_window
  344. )
  345. if matched_bbox:
  346. bbox = matched_bbox['bbox']
  347. cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
  348. cell['data-score'] = f"{matched_bbox['score']:.4f}"
  349. cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
  350. # ✅ 完整记录单元格信息
  351. cells.append({
  352. 'type': 'table_cell',
  353. 'text': cell_text,
  354. 'bbox': bbox,
  355. 'row': row_idx + 1,
  356. 'col': col_idx + 1,
  357. 'score': matched_bbox['score'],
  358. 'paddle_bbox_index': matched_bbox['paddle_bbox_index']
  359. })
  360. matched_bbox['used'] = True
  361. # ✅ 如果匹配失败,不应该添加到 cells 中
  362. return str(soup), cells, current_pointer