  1. """
  2. 合并 MinerU 和 PaddleOCR 的结果
  3. 使用 MinerU 的表格结构识别 + PaddleOCR 的文字框坐标
  4. """
  5. import json
  6. import re
  7. from pathlib import Path
  8. from typing import List, Dict, Tuple, Optional
  9. from bs4 import BeautifulSoup
  10. from fuzzywuzzy import fuzz

class MinerUPaddleOCRMerger:
    """Merge MinerU and PaddleOCR results."""

    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 80):
        """
        Args:
            look_ahead_window: how far ahead to look in the PaddleOCR box list when matching
            similarity_threshold: minimum similarity (0-100) for a fuzzy match to count
        """
        self.look_ahead_window = look_ahead_window
        self.similarity_threshold = similarity_threshold

    def merge_table_with_bbox(self, mineru_json_path: str, paddle_json_path: str,
                              output_path: Optional[str] = None) -> List[Dict]:
        """
        Merge one MinerU result with the matching PaddleOCR result.

        Args:
            mineru_json_path: path to the MinerU output JSON
            paddle_json_path: path to the PaddleOCR output JSON
            output_path: optional path to write the merged JSON

        Returns:
            The merged list of items.
        """
        # Load both inputs
        with open(mineru_json_path, 'r', encoding='utf-8') as f:
            mineru_data = json.load(f)
        with open(paddle_json_path, 'r', encoding='utf-8') as f:
            paddle_data = json.load(f)

        # Extract the PaddleOCR text boxes
        paddle_text_boxes = self._extract_paddle_text_boxes(paddle_data)

        # Walk the MinerU items and attach bbox information
        merged_data = self._process_mineru_data(mineru_data, paddle_text_boxes)

        # Optionally save the result
        if output_path:
            output_path = Path(output_path).resolve()
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(str(output_path), 'w', encoding='utf-8') as f:
                json.dump(merged_data, f, ensure_ascii=False, indent=2)

        return merged_data

    def _extract_paddle_text_boxes(self, paddle_data: Dict) -> List[Dict]:
        """Extract text boxes (text, bbox, polygon, score) from the PaddleOCR output."""
        text_boxes = []

        if 'overall_ocr_res' in paddle_data:
            ocr_res = paddle_data['overall_ocr_res']
            rec_texts = ocr_res.get('rec_texts', [])
            rec_polys = ocr_res.get('rec_polys', [])
            rec_scores = ocr_res.get('rec_scores', [])

            for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
                if text and text.strip():
                    # Reduce the polygon to an axis-aligned bbox (x_min, y_min, x_max, y_max)
                    xs = [p[0] for p in poly]
                    ys = [p[1] for p in poly]
                    bbox = [min(xs), min(ys), max(xs), max(ys)]
                    text_boxes.append({
                        'text': text,
                        'bbox': bbox,
                        'poly': poly,
                        'score': score,
                        'paddle_bbox_index': i,
                        'used': False  # marks whether this box has already been matched
                    })

        return text_boxes
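    # The PaddleOCR JSON is assumed to carry an 'overall_ocr_res' section with
    # parallel lists, roughly like this (hypothetical values):
    #   "overall_ocr_res": {
    #       "rec_texts":  ["交易日期", "余额", ...],
    #       "rec_polys":  [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ...],
    #       "rec_scores": [0.998, 0.987, ...]
    #   }
    # Files without this section simply yield an empty box list.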

    def _process_mineru_data(self, mineru_data: List[Dict],
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Walk the MinerU items and attach bbox information from PaddleOCR."""
        merged_data = []
        paddle_pointer = 0  # current position in the PaddleOCR box list

        for item in mineru_data:
            if item['type'] == 'table':
                # Tables: annotate each cell of the HTML with a matching bbox
                merged_item = item.copy()
                table_html = item.get('table_body', '')

                enhanced_html, paddle_pointer = self._enhance_table_html_with_bbox(
                    table_html, paddle_text_boxes, paddle_pointer
                )

                merged_item['table_body'] = enhanced_html
                merged_item['table_body_with_bbox'] = enhanced_html
                merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
                merged_data.append(merged_item)

            elif item['type'] in ['text', 'header']:
                # Plain text and headers: attach a single matching bbox
                merged_item = item.copy()
                text = item.get('text', '')

                matched_bbox, paddle_pointer = self._find_matching_bbox(
                    text, paddle_text_boxes, paddle_pointer
                )

                if matched_bbox:
                    merged_item['bbox'] = matched_bbox['bbox']
                    merged_item['bbox_source'] = 'paddle_ocr'
                    merged_item['text_score'] = matched_bbox['score']
                    matched_bbox['used'] = True  # do not reuse this box

                merged_data.append(merged_item)

            else:
                # Any other item type is copied through unchanged
                merged_data.append(item.copy())

        return merged_data

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, int]:
        """
        Annotate an HTML table with bbox information.

        Args:
            html: original HTML table
            paddle_text_boxes: PaddleOCR text boxes
            start_pointer: index to start searching from

        Returns:
            (enhanced HTML, new pointer position)
        """
        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer

        # Visit every cell in document order
        for cell in soup.find_all(['td', 'th']):
            cell_text = cell.get_text(strip=True)
            if not cell_text:
                continue

            # Find the PaddleOCR box that matches this cell's text
            matched_bbox, current_pointer = self._find_matching_bbox(
                cell_text, paddle_text_boxes, current_pointer
            )

            if matched_bbox:
                # Record the match as data-* attributes on the cell
                bbox = matched_bbox['bbox']
                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                cell['data-score'] = f"{matched_bbox['score']:.4f}"
                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
                matched_bbox['used'] = True  # do not reuse this box

        return str(soup), current_pointer
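    # An annotated cell ends up looking like this (hypothetical values):
    #   <td data-bbox="[120,340,260,372]" data-score="0.9876" data-paddle-index="17">1,234.56</td>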

    def _find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                            start_index: int) -> Tuple[Optional[Dict], int]:
        """
        Find the text box that matches the target text.

        Args:
            target_text: text to match
            text_boxes: PaddleOCR text boxes
            start_index: index to start searching from

        Returns:
            (matching text box, or None if nothing matched; new start index)
        """
        target_text = self._normalize_text(target_text)

        # Only search within the look-ahead window
        search_end = min(start_index + self.look_ahead_window, len(text_boxes))

        best_match = None
        best_index = start_index
        best_similarity = 0

        for i in range(start_index, search_end):
            if text_boxes[i]['used']:
                continue

            box_text = self._normalize_text(text_boxes[i]['text'])

            # Fuzzy similarity; token_set_ratio tolerates reordered or partial tokens
            # (fuzz.ratio would be the stricter alternative)
            similarity = fuzz.token_set_ratio(target_text, box_text)

            # Exact matches win immediately
            if target_text == box_text:
                return text_boxes[i], i + 1

            # Otherwise keep the best match above the threshold
            if similarity > best_similarity and similarity >= self.similarity_threshold:
                best_similarity = similarity
                best_match = text_boxes[i]
                best_index = i + 1

        return best_match, best_index
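    # Matching behaviour, roughly (hypothetical values):
    #   a target "余 额" normalizes to "余额"; if an unused box in the window also
    #   normalizes to "余额", it is returned at once and the pointer moves past it.
    #   Failing that, the unused box with the highest token_set_ratio at or above the
    #   threshold wins; if nothing clears it, (None, start_index) comes back and the
    #   pointer stays where it was.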

    def _normalize_text(self, text: str) -> str:
        """Normalize text for matching (strip whitespace, fold width, lower-case)."""
        # Remove all whitespace
        text = re.sub(r'\s+', '', text)
        # Convert full-width digits, letters and punctuation to half-width
        text = self._full_to_half(text)
        return text.lower()

    def _full_to_half(self, text: str) -> str:
        """Convert full-width characters to half-width."""
        result = []
        for char in text:
            code = ord(char)
            if code == 0x3000:  # full-width space
                code = 0x0020
            elif 0xFF01 <= code <= 0xFF5E:  # full-width ASCII range
                code -= 0xFEE0
            result.append(chr(code))
        return ''.join(result)
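    # For example, _normalize_text("１２３ ＡＢＣ") yields "123abc":
    # the full-width digits and letters fold to ASCII and the space is dropped.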

    def generate_enhanced_markdown(self, merged_data: List[Dict],
                                   output_path: Optional[str] = None) -> str:
        """
        Generate enhanced Markdown with bbox information embedded as comments.

        Args:
            merged_data: merged items
            output_path: optional path to write the Markdown

        Returns:
            The Markdown content.
        """
        md_lines = []

        for item in merged_data:
            if item['type'] == 'header':
                text = item.get('text', '')
                bbox = item.get('bbox', [])
                if bbox:
                    md_lines.append(f"<!-- bbox: {bbox} -->")
                md_lines.append(f"# {text}\n")

            elif item['type'] == 'text':
                text = item.get('text', '')
                bbox = item.get('bbox', [])
                if bbox:
                    md_lines.append(f"<!-- bbox: {bbox} -->")
                md_lines.append(f"{text}\n")

            elif item['type'] == 'table':
                md_lines.append("\n## Table\n")
                md_lines.append("<!-- table cells carry data-bbox attributes -->\n")
                md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
                md_lines.append("\n")

        markdown_content = '\n'.join(md_lines)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

        return markdown_content
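    # The emitted Markdown interleaves bbox comments with the content, e.g. (hypothetical values):
    #   <!-- bbox: [55, 120, 480, 152] -->
    #   # Some heading
    #
    #   ## Table
    #   <!-- table cells carry data-bbox attributes -->
    #   <table>...</table>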

    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
        """
        Extract all table cells together with their bbox information.

        Returns:
            A list of cells, each carrying text, bbox, row, col, score and paddle_index.
        """
        cells = []

        for item in merged_data:
            if item['type'] != 'table':
                continue

            html = item.get('table_body_with_bbox', item.get('table_body', ''))
            soup = BeautifulSoup(html, 'html.parser')

            # Iterate over rows, then over the cells of each row
            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    cell_text = cell.get_text(strip=True)
                    bbox_str = cell.get('data-bbox', '')

                    if bbox_str:
                        try:
                            bbox = json.loads(bbox_str)
                            cells.append({
                                'text': cell_text,
                                'bbox': bbox,
                                'row': row_idx,
                                'col': col_idx,
                                'score': float(cell.get('data-score', 0)),
                                'paddle_index': int(cell.get('data-paddle-index', -1))
                            })
                        except (json.JSONDecodeError, ValueError):
                            pass

        return cells
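    # A single extracted cell looks like this (hypothetical values):
    #   {"text": "1,234.56", "bbox": [120, 340, 260, 372], "row": 3, "col": 5,
    #    "score": 0.9876, "paddle_index": 17}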

def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str):
    """
    Merge MinerU and PaddleOCR results in batch.

    Args:
        mineru_dir: directory with MinerU results
        paddle_dir: directory with PaddleOCR results
        output_dir: output directory
    """
    mineru_path = Path(mineru_dir)
    paddle_path = Path(paddle_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    merger = MinerUPaddleOCRMerger(look_ahead_window=10, similarity_threshold=80)

    # Find all per-page MinerU JSON files (names like ..._page_001.json)
    mineru_files = list(mineru_path.glob('*_page_*[0-9].json'))
    mineru_files.sort()
    print(f"Found {len(mineru_files)} MinerU files")

    for mineru_file in mineru_files:
        # The PaddleOCR result is expected under the same file name
        paddle_file = paddle_path / mineru_file.name
        if not paddle_file.exists():
            print(f"⚠️ No matching PaddleOCR file: {paddle_file}")
            continue

        print(f"Processing: {mineru_file.name}")

        # Output file paths
        merged_json_path = output_path / f"{mineru_file.stem}_merged.json"
        merged_md_path = output_path / f"{mineru_file.stem}_merged.md"
        cells_json_path = output_path / f"{mineru_file.stem}_cells.json"

        try:
            # Merge the two results
            merged_data = merger.merge_table_with_bbox(
                str(mineru_file),
                str(paddle_file),
                str(merged_json_path)
            )

            # Generate the enhanced Markdown
            merger.generate_enhanced_markdown(merged_data, str(merged_md_path))

            # Extract cell-level information
            cells = merger.extract_table_cells_with_bbox(merged_data)
            with open(cells_json_path, 'w', encoding='utf-8') as f:
                json.dump(cells, f, ensure_ascii=False, indent=2)

            print("  ✅ Merge finished")
            print(f"  - Extracted {len(cells)} table cells")

        except Exception as e:
            print(f"  ❌ Processing failed: {e}")
            import traceback
            traceback.print_exc()

if __name__ == "__main__":
    # Example: batch merge
    mineru_dir = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results"
    paddle_dir = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results"
    output_dir = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results"
    merge_mineru_paddle_batch(mineru_dir, paddle_dir, output_dir)

    # Example: merge a single file
    # mineru_json = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results/A用户_单元格扫描流水_page_001.json"
    # paddle_json = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results/A用户_单元格扫描流水_page_001.json"
    # output_json = "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results/A用户_单元格扫描流水_page_001.json"
    # merger = MinerUPaddleOCRMerger(look_ahead_window=10, similarity_threshold=80)
    # merger.merge_table_with_bbox(mineru_json, paddle_json, output_json)