data_processor.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. """
Data processing module.
Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox information.
  4. """
  5. from typing import List, Dict, Tuple
  6. from bs4 import BeautifulSoup
  7. try:
  8. from .text_matcher import TextMatcher
  9. from .bbox_extractor import BBoxExtractor
  10. except ImportError:
  11. from text_matcher import TextMatcher
  12. from bbox_extractor import BBoxExtractor
  13. class DataProcessor:
  14. """数据处理器"""
  15. def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
  16. """
  17. Args:
  18. text_matcher: 文本匹配器
  19. look_ahead_window: 向前查找窗口
  20. """
  21. self.text_matcher = text_matcher
  22. self.look_ahead_window = look_ahead_window
  23. def process_mineru_data(self, mineru_data: List[Dict],
  24. paddle_text_boxes: List[Dict]) -> List[Dict]:
  25. """
  26. 处理 MinerU 数据,添加 bbox 信息
  27. Args:
  28. mineru_data: MinerU 数据
  29. paddle_text_boxes: PaddleOCR 文字框列表
  30. Returns:
  31. 合并后的数据, table cell使用paddle的bbox,其他类型只是移动指针,bbox还是沿用minerU的bbox
  32. """
  33. merged_data = []
  34. paddle_pointer = 0
  35. last_matched_index = 0
  36. # 按 bbox 排序
  37. mineru_data.sort(
  38. key=lambda x: (x['bbox'][1], x['bbox'][0])
  39. if 'bbox' in x else (float('inf'), float('inf'))
  40. )
  41. for item in mineru_data:
  42. item_type = item.get('type', '')
  43. if item_type == 'table':
  44. merged_item, paddle_pointer = self._process_table(
  45. item, paddle_text_boxes, paddle_pointer
  46. )
  47. merged_data.append(merged_item)
  48. elif item_type in ['text', 'title']:
  49. merged_item, paddle_pointer, last_matched_index = self._process_text(
  50. item, paddle_text_boxes, paddle_pointer, last_matched_index
  51. )
  52. merged_data.append(merged_item)
  53. elif item_type == 'list':
  54. merged_item, paddle_pointer, last_matched_index = self._process_list(
  55. item, paddle_text_boxes, paddle_pointer, last_matched_index
  56. )
  57. merged_data.append(merged_item)
  58. else:
  59. merged_data.append(item.copy())
  60. return merged_data
  61. def process_dotsocr_data(self, dotsocr_data: List[Dict],
  62. paddle_text_boxes: List[Dict]) -> List[Dict]:
  63. """
  64. 🎯 处理 DotsOCR 数据,转换为 MinerU 格式并添加 bbox 信息
  65. Args:
  66. dotsocr_data: DotsOCR 数据
  67. paddle_text_boxes: PaddleOCR 文字框列表
  68. Returns:
  69. MinerU 格式的合并数据
  70. """
  71. merged_data = []
  72. paddle_pointer = 0
  73. last_matched_index = 0
  74. # 按 bbox 排序
  75. dotsocr_data.sort(
  76. key=lambda x: (x['bbox'][1], x['bbox'][0])
  77. if 'bbox' in x else (float('inf'), float('inf'))
  78. )
  79. for item in dotsocr_data:
  80. # 🎯 转换为 MinerU 格式
  81. mineru_item = self._convert_dotsocr_to_mineru(item)
  82. category = mineru_item.get('type', '')
  83. # 🎯 根据类型处理
  84. if category.lower() == 'table':
  85. merged_item, paddle_pointer = self._process_dotsocr_table(
  86. mineru_item, paddle_text_boxes, paddle_pointer
  87. )
  88. merged_data.append(merged_item)
  89. elif category.lower() in ['text', 'title', 'header', 'footer']:
  90. merged_item, paddle_pointer, last_matched_index = self._process_text(
  91. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  92. )
  93. merged_data.append(merged_item)
  94. elif category.lower() == 'list':
  95. merged_item, paddle_pointer, last_matched_index = self._process_list(
  96. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  97. )
  98. merged_data.append(merged_item)
  99. else:
  100. # Page-header, Page-footer, Picture 等
  101. merged_data.append(mineru_item)
  102. return merged_data
  103. def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
  104. """
  105. 🎯 将 DotsOCR 格式转换为 MinerU 格式
  106. DotsOCR:
  107. {
  108. "category": "Table",
  109. "bbox": [x1, y1, x2, y2],
  110. "text": "..."
  111. }
  112. MinerU:
  113. {
  114. "type": "table",
  115. "bbox": [x1, y1, x2, y2],
  116. "table_body": "...",
  117. "page_idx": 0
  118. }
  119. """
  120. category = dotsocr_item.get('category', '')
  121. # 🎯 Category 映射
  122. category_map = {
  123. 'Page-header': 'header',
  124. 'Page-footer': 'footer',
  125. 'Picture': 'image',
  126. 'Figure': 'image',
  127. 'Section-header': 'title',
  128. 'Table': 'table',
  129. 'Text': 'text',
  130. 'Title': 'title',
  131. 'List': 'list',
  132. 'Caption': 'title'
  133. }
  134. mineru_type = category_map.get(category, 'text')
  135. # 🎯 基础转换
  136. mineru_item = {
  137. 'type': mineru_type,
  138. 'bbox': dotsocr_item.get('bbox', []),
  139. 'page_idx': 0 # DotsOCR 默认单页
  140. }
  141. # 🎯 处理文本内容
  142. text = dotsocr_item.get('text', '')
  143. if mineru_type == 'table':
  144. # 表格:text -> table_body
  145. mineru_item['table_body'] = text
  146. else:
  147. # 其他类型:保持 text
  148. mineru_item['text'] = text
  149. # 标题级别
  150. if category == 'Section-header':
  151. mineru_item['text_level'] = 1
  152. return mineru_item
  153. def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
  154. start_pointer: int) -> Tuple[Dict, int]:
  155. """
  156. 🎯 处理 DotsOCR 表格(已转换为 MinerU 格式)
  157. DotsOCR 的表格 HTML 已经在 text 字段中,需要转移到 table_body
  158. """
  159. merged_item = item.copy()
  160. table_html = item.get('table_body', '')
  161. if not table_html:
  162. return merged_item, start_pointer
  163. # 🎯 复用表格处理逻辑
  164. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  165. table_html, paddle_text_boxes, start_pointer
  166. )
  167. merged_item['table_body'] = enhanced_html
  168. merged_item['table_body_with_bbox'] = enhanced_html
  169. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  170. merged_item['table_cells'] = cells if cells else []
  171. return merged_item, new_pointer
  172. def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
  173. paddle_text_boxes: List[Dict]) -> List[Dict]:
  174. """
  175. 处理 PaddleOCR_VL 数据,添加 bbox 信息
  176. Args:
  177. paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
  178. paddle_text_boxes: PaddleOCR 文字框列表
  179. Returns:
  180. 🎯 MinerU 格式的合并数据(统一输出格式)
  181. """
  182. merged_data = []
  183. paddle_pointer = 0
  184. last_matched_index = 0
  185. # 🎯 获取旋转角度和原始图像尺寸
  186. rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
  187. orig_image_size = None
  188. if rotation_angle != 0:
  189. orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
  190. print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
  191. print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
  192. # 提取 parsing_res_list
  193. parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
  194. # 按 bbox 排序
  195. parsing_res_list.sort(
  196. key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
  197. if 'block_bbox' in x else (float('inf'), float('inf'))
  198. )
  199. for item in parsing_res_list:
  200. # 🎯 先转换 bbox 坐标(如果需要)
  201. if rotation_angle != 0 and orig_image_size:
  202. item = self._transform_vl_block_bbox(item, rotation_angle, orig_image_size)
  203. # 🎯 统一转换为 MinerU 格式
  204. mineru_item = self._convert_paddleocr_vl_to_mineru(item)
  205. item_type = mineru_item.get('type', '')
  206. # 🎯 根据类型处理(复用 MinerU 的通用方法)
  207. if item_type == 'table':
  208. merged_item, paddle_pointer = self._process_table(
  209. mineru_item, paddle_text_boxes, paddle_pointer
  210. )
  211. merged_data.append(merged_item)
  212. elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
  213. merged_item, paddle_pointer, last_matched_index = self._process_text(
  214. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  215. )
  216. merged_data.append(merged_item)
  217. elif item_type == 'list':
  218. merged_item, paddle_pointer, last_matched_index = self._process_list(
  219. mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
  220. )
  221. merged_data.append(merged_item)
  222. else:
  223. # 其他类型(image 等)直接添加
  224. merged_data.append(mineru_item)
  225. return merged_data
  226. def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
  227. """从 PaddleOCR_VL 数据中获取旋转角度"""
  228. return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)
  229. def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
  230. """从 PaddleOCR_VL 数据中获取原始图像尺寸"""
  231. return BBoxExtractor._get_original_image_size(paddleocr_vl_data)
  232. def _transform_vl_block_bbox(self, item: Dict, angle: float,
  233. orig_image_size: tuple) -> Dict:
  234. """
  235. 转换 PaddleOCR_VL 的 block_bbox 坐标
  236. Args:
  237. item: PaddleOCR_VL 的 block 数据
  238. angle: 旋转角度
  239. orig_image_size: 原始图像尺寸
  240. Returns:
  241. 转换后的 block 数据
  242. """
  243. transformed_item = item.copy()
  244. if 'block_bbox' not in item:
  245. return transformed_item
  246. block_bbox = item['block_bbox']
  247. if len(block_bbox) < 4:
  248. return transformed_item
  249. # block_bbox 格式: [x1, y1, x2, y2]
  250. # 转换为 poly 格式进行旋转
  251. poly = [
  252. [block_bbox[0], block_bbox[1]], # 左上
  253. [block_bbox[2], block_bbox[1]], # 右上
  254. [block_bbox[2], block_bbox[3]], # 右下
  255. [block_bbox[0], block_bbox[3]] # 左下
  256. ]
  257. # 🎯 使用 BBoxExtractor 的坐标转换方法
  258. transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
  259. poly, angle, orig_image_size
  260. )
  261. # 转换回 bbox 格式
  262. xs = [p[0] for p in transformed_poly]
  263. ys = [p[1] for p in transformed_poly]
  264. transformed_bbox = [min(xs), min(ys), max(xs), max(ys)]
  265. transformed_item['block_bbox'] = transformed_bbox
  266. return transformed_item
  267. def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
  268. """
  269. 🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
  270. 基于 PP-DocLayout_plus-L 的 20 种类别
  271. """
  272. block_label = paddleocr_vl_item.get('block_label', '')
  273. # 🎯 PP-DocLayout_plus-L 类别映射(共 20 种)
  274. label_map = {
  275. # 标题类(3种)
  276. 'paragraph_title': 'title',
  277. 'doc_title': 'title',
  278. 'figure_table_chart_title': 'title',
  279. # 文本类(9种)
  280. 'text': 'text',
  281. 'number': 'text',
  282. 'content': 'text',
  283. 'abstract': 'text',
  284. 'footnote': 'text',
  285. 'aside_text': 'text',
  286. 'algorithm': 'text',
  287. 'reference': 'text',
  288. 'reference_content': 'text',
  289. # 页眉页脚(2种)
  290. 'header': 'header',
  291. 'footer': 'footer',
  292. # 表格(1种)
  293. 'table': 'table',
  294. # 图片/图表(3种)
  295. 'image': 'image',
  296. 'chart': 'image',
  297. 'seal': 'image',
  298. # 公式(2种)
  299. 'formula': 'equation',
  300. 'formula_number': 'equation'
  301. }
  302. mineru_type = label_map.get(block_label, 'text')
  303. mineru_item = {
  304. 'type': mineru_type,
  305. 'bbox': paddleocr_vl_item.get('block_bbox', []),
  306. 'page_idx': 0
  307. }
  308. content = paddleocr_vl_item.get('block_content', '')
  309. if mineru_type == 'table':
  310. mineru_item['table_body'] = content
  311. else:
  312. mineru_item['text'] = content
  313. # 标题级别
  314. if block_label == 'doc_title':
  315. mineru_item['text_level'] = 1
  316. elif block_label == 'paragraph_title':
  317. mineru_item['text_level'] = 2
  318. elif block_label == 'figure_table_chart_title':
  319. mineru_item['text_level'] = 3
  320. return mineru_item
  321. def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
  322. start_pointer: int) -> Tuple[Dict, int]:
  323. """处理 MinerU 表格"""
  324. merged_item = item.copy()
  325. table_html = item.get('table_body', '')
  326. enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
  327. table_html, paddle_text_boxes, start_pointer
  328. )
  329. merged_item['table_body'] = enhanced_html
  330. merged_item['table_body_with_bbox'] = enhanced_html
  331. merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
  332. merged_item['table_cells'] = cells if cells else []
  333. return merged_item, new_pointer
  334. def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
  335. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  336. """处理文本"""
  337. merged_item = item.copy()
  338. text = item.get('text', '')
  339. matched_bbox, paddle_pointer, last_matched_index = \
  340. self.text_matcher.find_matching_bbox(
  341. text, paddle_text_boxes, paddle_pointer, last_matched_index,
  342. self.look_ahead_window
  343. )
  344. if matched_bbox:
  345. matched_bbox['used'] = True
  346. return merged_item, paddle_pointer, last_matched_index
  347. def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
  348. paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
  349. """处理列表"""
  350. merged_item = item.copy()
  351. list_items = item.get('list_items', [])
  352. for list_item in list_items:
  353. matched_bbox, paddle_pointer, last_matched_index = \
  354. self.text_matcher.find_matching_bbox(
  355. list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
  356. self.look_ahead_window
  357. )
  358. if matched_bbox:
  359. matched_bbox['used'] = True
  360. return merged_item, paddle_pointer, last_matched_index
  361. def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  362. start_pointer: int) -> Tuple[str, List[Dict], int]:
  363. """为 HTML 表格添加 bbox 信息"""
  364. soup = BeautifulSoup(html, 'html.parser')
  365. current_pointer = start_pointer
  366. last_matched_index = start_pointer
  367. cells = []
  368. for row_idx, row in enumerate(soup.find_all('tr')):
  369. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  370. cell_text = cell.get_text(strip=True)
  371. if not cell_text:
  372. continue
  373. matched_bbox, current_pointer, last_matched_index = \
  374. self.text_matcher.find_matching_bbox(
  375. cell_text, paddle_text_boxes, current_pointer,
  376. last_matched_index, self.look_ahead_window
  377. )
  378. if matched_bbox:
  379. bbox = matched_bbox['bbox']
  380. cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
  381. cell['data-score'] = f"{matched_bbox['score']:.4f}"
  382. cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
  383. # ✅ 完整记录单元格信息
  384. cells.append({
  385. 'type': 'table_cell',
  386. 'text': cell_text,
  387. 'bbox': bbox,
  388. 'row': row_idx + 1,
  389. 'col': col_idx + 1,
  390. 'score': matched_bbox['score'],
  391. 'paddle_bbox_index': matched_bbox['paddle_bbox_index']
  392. })
  393. matched_bbox['used'] = True
  394. # ✅ 如果匹配失败,不应该添加到 cells 中
  395. return str(soup), cells, current_pointer