# merge_mineru_paddle_ocr.py

  1. """
  2. 合并 MinerU 和 PaddleOCR 的结果
  3. 使用 MinerU 的表格结构识别 + PaddleOCR 的文字框坐标
  4. """
  5. import json
  6. import re
  7. import argparse
  8. from pathlib import Path
  9. from typing import List, Dict, Tuple, Optional
  10. from bs4 import BeautifulSoup
  11. from fuzzywuzzy import fuzz
  12. import shutil


class MinerUPaddleOCRMerger:
    """Merge MinerU and PaddleOCR results."""

    def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
        """
        Args:
            look_ahead_window: size of the look-ahead search window
            similarity_threshold: text similarity threshold (0-100)
        """
        self.look_ahead_window = look_ahead_window
        self.similarity_threshold = similarity_threshold

    def merge_table_with_bbox(self, mineru_json_path: str, paddle_json_path: str) -> List[Dict]:
        """
        Merge MinerU and PaddleOCR results.

        Args:
            mineru_json_path: path to the MinerU output JSON
            paddle_json_path: path to the PaddleOCR output JSON

        Returns:
            List of merged result items
        """
        # Load the data
        with open(mineru_json_path, 'r', encoding='utf-8') as f:
            mineru_data = json.load(f)
        with open(paddle_json_path, 'r', encoding='utf-8') as f:
            paddle_data = json.load(f)

        # Extract the PaddleOCR text boxes
        paddle_text_boxes = self._extract_paddle_text_boxes(paddle_data)

        # Process the MinerU data
        merged_data = self._process_mineru_data(mineru_data, paddle_text_boxes)
        return merged_data

    def _extract_paddle_text_boxes(self, paddle_data: Dict) -> List[Dict]:
        """Extract text-box information from the PaddleOCR result."""
        text_boxes = []
        if 'overall_ocr_res' in paddle_data:
            ocr_res = paddle_data['overall_ocr_res']
            rec_texts = ocr_res.get('rec_texts', [])
            rec_polys = ocr_res.get('rec_polys', [])
            rec_scores = ocr_res.get('rec_scores', [])
            for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
                if text and text.strip():
                    # Compute the bbox (x_min, y_min, x_max, y_max)
                    xs = [p[0] for p in poly]
                    ys = [p[1] for p in poly]
                    bbox = [min(xs), min(ys), max(xs), max(ys)]
                    text_boxes.append({
                        'text': text,
                        'bbox': bbox,
                        'poly': poly,
                        'score': score,
                        'paddle_bbox_index': i,
                        'used': False  # whether this box has already been matched
                    })
        return text_boxes
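
    # Illustrative sketch of the PaddleOCR JSON shape this method expects
    # (field names come from the code above; the sample values are made up):
    #
    #   {"overall_ocr_res": {
    #       "rec_texts":  ["Account No.", "2024-01-01", ...],
    #       "rec_polys":  [[[10, 20], [80, 20], [80, 40], [10, 40]], ...],
    #       "rec_scores": [0.998, 0.991, ...]}}
    #
    # Each polygon is reduced to its axis-aligned bounding box before matching.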

    def _process_mineru_data(self, mineru_data: List[Dict],
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Process the MinerU data and attach bbox information."""
        merged_data = []
        all_cells = []      # table-cell records accumulated from every table
        paddle_pointer = 0  # pointer into the PaddleOCR text boxes

        # Sort mineru_data by bbox top-to-bottom, then left-to-right, so the
        # reading order stays consistent with the OCR order.
        mineru_data.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]) if 'bbox' in x else (float('inf'), float('inf')))

        for item in mineru_data:
            if item['type'] == 'table':
                # Handle tables
                merged_item = item.copy()
                table_html = item.get('table_body', '')
                # Parse the HTML table and add bbox attributes
                enhanced_html, cells, paddle_pointer = self._enhance_table_html_with_bbox(
                    table_html, paddle_text_boxes, paddle_pointer
                )
                merged_item['table_body'] = enhanced_html
                merged_item['table_body_with_bbox'] = enhanced_html
                merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
                merged_data.append(merged_item)
                all_cells.extend(cells)
            elif item['type'] in ['text', 'header']:
                # Handle plain text
                merged_item = item.copy()
                text = item.get('text', '')
                # Look up the matching bbox
                matched_bbox, paddle_pointer = self._find_matching_bbox(
                    text, paddle_text_boxes, paddle_pointer
                )
                if matched_bbox:
                    merged_item['bbox'] = matched_bbox['bbox']
                    merged_item['bbox_source'] = 'paddle_ocr'
                    merged_item['text_score'] = matched_bbox['score']
                    # Mark the box as used
                    matched_bbox['used'] = True
                merged_data.append(merged_item)
            else:
                # Copy other item types unchanged
                merged_data.append(item.copy())

        if all_cells:
            merged_data.extend(all_cells)
        return merged_data

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, List[Dict], int]:
        """
        Add bbox information to an HTML table.

        Args:
            html: original HTML table
            paddle_text_boxes: list of PaddleOCR text boxes
            start_pointer: starting pointer position

        Returns:
            (enhanced HTML, list of cell records, new pointer position)
        """
        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer
        cells = []  # bbox records for the individual cells

        # Iterate over all rows
        for row_idx, row in enumerate(soup.find_all('tr')):
            # Iterate over all cells in the row
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue
                # Look up the matching bbox
                matched_bbox, current_pointer = self._find_matching_bbox(
                    cell_text, paddle_text_boxes, current_pointer
                )
                if matched_bbox:
                    # Add data-bbox attributes to the cell
                    bbox = matched_bbox['bbox']
                    cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                    cell['data-score'] = f"{matched_bbox['score']:.4f}"
                    cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])
                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'bbox': bbox,
                        'row': row_idx + 1,
                        'col': col_idx + 1,
                        'score': matched_bbox['score'],
                        'paddle_bbox_index': matched_bbox['paddle_bbox_index']
                    })
                    # Mark the box as used
                    matched_bbox['used'] = True
        return str(soup), cells, current_pointer
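
    # Illustrative result of the enhancement above (coordinates and index are made up):
    # a matched cell such as <td>1,000.00</td> would come back as
    #   <td data-bbox="[105,220,198,244]" data-score="0.9978" data-paddle-index="12">1,000.00</td>
    # so downstream consumers can read cell coordinates directly from the HTML.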

    def _find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                            start_index: int) -> Tuple[Optional[Dict], int]:
        """
        Find the text box that matches the target text.

        Args:
            target_text: text to match
            text_boxes: list of text boxes
            start_index: starting index

        Returns:
            (matched text-box record or None, new pointer position)
        """
        target_text = self._normalize_text(target_text)
        # Search a window that runs from one third of the window size before
        # start_index up to start_index + look_ahead_window.
        search_start = max(0, int(start_index - self.look_ahead_window / 3))
        search_end = min(start_index + self.look_ahead_window, len(text_boxes))
        best_match = None
        best_index = start_index
        for i in range(search_start, search_end):
            if text_boxes[i]['used']:
                continue
            box_text = self._normalize_text(text_boxes[i]['text'])
            # Compute the similarity
            similarity = fuzz.partial_ratio(target_text, box_text)
            # Exact matches take priority
            if target_text == box_text:
                return text_boxes[i], i + 1
            # Return as soon as the threshold is reached; do not search for the best match
            if similarity >= self.similarity_threshold:
                return text_boxes[i], i + 1
        return best_match, best_index

    def _normalize_text(self, text: str) -> str:
        """Normalize text (strip whitespace, convert width, lowercase)."""
        # Remove all whitespace characters
        text = re.sub(r'\s+', '', text)
        # Convert full-width digits and letters to half-width
        text = self._full_to_half(text)
        return text.lower()

    def _full_to_half(self, text: str) -> str:
        """Convert full-width characters to half-width."""
        result = []
        for char in text:
            code = ord(char)
            if code == 0x3000:  # full-width space
                code = 0x0020
            elif 0xFF01 <= code <= 0xFF5E:  # full-width ASCII range
                code -= 0xFEE0
            result.append(chr(code))
        return ''.join(result)
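
    # Illustrative example of the normalization above (the sample string is made up):
    #   "１２３ ＡＢＣ" -> strip whitespace -> full-to-half -> lowercase -> "123abc"
    # so width and spacing differences between MinerU text and PaddleOCR text
    # do not defeat the fuzzy match.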

    def generate_enhanced_markdown(self, merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   mineru_file: Optional[str] = None) -> str:
        """
        Generate enhanced Markdown (with bbox information in comments).

        Args:
            merged_data: merged data
            output_path: output path (optional)
            mineru_file: original MinerU JSON path, used to locate images (optional)

        Returns:
            Markdown content
        """
        md_lines = []
        for item in merged_data:
            if item['type'] == 'header':
                text = item.get('text', '')
                bbox = item.get('bbox', [])
                md_lines.append(f"<!-- bbox: {bbox} -->")
                md_lines.append(f"# {text}\n")
            elif item['type'] == 'text':
                text = item.get('text', '')
                bbox = item.get('bbox', [])
                if bbox:
                    md_lines.append(f"<!-- bbox: {bbox} -->")
                md_lines.append(f"{text}\n")
            elif item['type'] == 'table':
                md_lines.append("<!-- table cells carry data-bbox attributes -->\n")
                md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
                md_lines.append("\n")
            elif item['type'] == 'image':
                img_path = item.get('img_path', '')
                # Copy the image from the MinerU image directory into the output directory
                if img_path and mineru_file:
                    mineru_dir = Path(mineru_file).parent
                    img_full_path = mineru_dir / img_path
                    if img_full_path.exists():
                        output_img_path = Path(output_path).parent / img_path
                        output_img_path.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy(img_full_path, output_img_path)
                bbox = item.get('bbox', [])
                if bbox:
                    md_lines.append(f"<!-- bbox: {bbox} -->")
                md_lines.append(f"![Image]({img_path})\n")

        markdown_content = '\n'.join(md_lines)
        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        return markdown_content
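
    # Illustrative fragment of the Markdown produced above (values are made up):
    #
    #   <!-- bbox: [34, 52, 310, 88] -->
    #   # <header text>
    #
    #   <!-- table cells carry data-bbox attributes -->
    #   <table>...<td data-bbox="[105,220,198,244]" data-score="0.9978" ...>...</td>...</table>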

    def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
        """
        Extract all table cells together with their bbox information.

        Returns:
            List of cells, each with text, bbox, row, col, etc.
        """
        cells = []
        for item in merged_data:
            if item['type'] != 'table':
                continue
            html = item.get('table_body_with_bbox', item.get('table_body', ''))
            soup = BeautifulSoup(html, 'html.parser')
            # Iterate over all rows
            for row_idx, row in enumerate(soup.find_all('tr')):
                # Iterate over all cells in the row
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    cell_text = cell.get_text(strip=True)
                    bbox_str = cell.get('data-bbox', '')
                    if bbox_str:
                        try:
                            bbox = json.loads(bbox_str)
                            cells.append({
                                'text': cell_text,
                                'bbox': bbox,
                                'row': row_idx,
                                'col': col_idx,
                                'score': float(cell.get('data-score', 0)),
                                'paddle_index': int(cell.get('data-paddle-index', -1))
                            })
                        except (json.JSONDecodeError, ValueError):
                            pass
        return cells
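

# Minimal programmatic usage sketch (the file names below are placeholders; the
# command-line interface defined further down is the primary entry point):
#
#   merger = MinerUPaddleOCRMerger(look_ahead_window=15, similarity_threshold=85)
#   merged = merger.merge_table_with_bbox("page_001.mineru.json", "page_001.paddle.json")
#   merger.generate_enhanced_markdown(merged, "page_001.md", "page_001.mineru.json")
#   cells = merger.extract_table_cells_with_bbox(merged)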


def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
                      output_format: str, merger: MinerUPaddleOCRMerger) -> bool:
    """
    Merge a single file pair.

    Args:
        mineru_file: path to the MinerU JSON file
        paddle_file: path to the PaddleOCR JSON file
        output_dir: output directory
        output_format: 'json', 'markdown', or 'both'
        merger: merger instance

    Returns:
        True on success
    """
    print(f"📄 Processing: {mineru_file.name}")

    # Output file paths
    merged_md_path = output_dir / f"{mineru_file.stem}.md"
    merged_json_path = output_dir / f"{mineru_file.stem}.json"

    try:
        # Merge the data
        merged_data = merger.merge_table_with_bbox(
            str(mineru_file),
            str(paddle_file)
        )
        # Generate Markdown
        if output_format in ['markdown', 'both']:
            merger.generate_enhanced_markdown(merged_data, str(merged_md_path), str(mineru_file))
        # Write JSON
        if output_format in ['json', 'both']:
            with open(merged_json_path, 'w', encoding='utf-8') as f:
                json.dump(merged_data, f, ensure_ascii=False, indent=2)

        print(f"   ✅ Merge finished")
        print(f"   📊 Processed {len(merged_data)} items")
        print(f"   💾 Output files:")
        print(f"      - {merged_json_path.name}")
        return True
    except Exception as e:
        print(f"   ❌ Processing failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str, output_format: str = 'both',
                              look_ahead_window: int = 10,
                              similarity_threshold: int = 80):
    """
    Merge MinerU and PaddleOCR results in batch.

    Args:
        mineru_dir: MinerU result directory
        paddle_dir: PaddleOCR result directory
        output_dir: output directory
        output_format: 'json', 'markdown', or 'both'
        look_ahead_window: size of the look-ahead search window
        similarity_threshold: text similarity threshold
    """
    mineru_path = Path(mineru_dir)
    paddle_path = Path(paddle_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    merger = MinerUPaddleOCRMerger(
        look_ahead_window=look_ahead_window,
        similarity_threshold=similarity_threshold
    )

    # Find all MinerU JSON files
    mineru_files = list(mineru_path.glob('*_page_*[0-9].json'))
    mineru_files.sort()

    print(f"\n🔍 Found {len(mineru_files)} MinerU files")
    print(f"📂 MinerU directory: {mineru_dir}")
    print(f"📂 PaddleOCR directory: {paddle_dir}")
    print(f"📂 Output directory: {output_dir}")
    print(f"⚙️ Look-ahead window: {look_ahead_window}")
    print(f"⚙️ Similarity threshold: {similarity_threshold}%\n")

    success_count = 0
    failed_count = 0
    for mineru_file in mineru_files:
        # Find the corresponding PaddleOCR file
        paddle_file = paddle_path / mineru_file.name
        if not paddle_file.exists():
            print(f"⚠️ Skipped: no matching PaddleOCR file: {paddle_file.name}\n")
            failed_count += 1
            continue
        if merge_single_file(mineru_file, paddle_file, output_path, output_format, merger):
            success_count += 1
        else:
            failed_count += 1
        print()  # blank line between files

    # Print summary statistics
    print("=" * 60)
    print(f"✅ Done!")
    print(f"📊 Summary:")
    print(f"   - Total files: {len(mineru_files)}")
    print(f"   - Succeeded: {success_count}")
    print(f"   - Failed: {failed_count}")
    print("=" * 60)


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description='Merge MinerU and PaddleOCR recognition results and add bbox coordinates',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  1. Batch-process a whole directory:
     python merge_mineru_paddle_ocr.py \\
         --mineru-dir /path/to/mineru/results \\
         --paddle-dir /path/to/paddle/results \\
         --output-dir /path/to/output

  2. Process a single file:
     python merge_mineru_paddle_ocr.py \\
         --mineru-file /path/to/file_page_001.json \\
         --paddle-file /path/to/file_page_001.json \\
         --output-dir /path/to/output

  3. Custom parameters:
     python merge_mineru_paddle_ocr.py \\
         --mineru-dir /path/to/mineru \\
         --paddle-dir /path/to/paddle \\
         --output-dir /path/to/output \\
         --window 15 \\
         --threshold 85
"""
    )

    # File / directory arguments
    file_group = parser.add_argument_group('file arguments')
    file_group.add_argument(
        '--mineru-file',
        type=str,
        help='path to the MinerU output JSON file (single-file mode)'
    )
    file_group.add_argument(
        '--paddle-file',
        type=str,
        help='path to the PaddleOCR output JSON file (single-file mode)'
    )

    dir_group = parser.add_argument_group('directory arguments')
    dir_group.add_argument(
        '--mineru-dir',
        type=str,
        help='MinerU result directory (batch mode)'
    )
    dir_group.add_argument(
        '--paddle-dir',
        type=str,
        help='PaddleOCR result directory (batch mode)'
    )

    # Output arguments
    output_group = parser.add_argument_group('output arguments')
    output_group.add_argument(
        '-o', '--output-dir',
        type=str,
        required=True,
        help='output directory (required)'
    )
    output_group.add_argument(
        '-f', '--format',
        choices=['json', 'markdown', 'both'],
        default='both',
        help='output format'
    )

    # Algorithm arguments
    algo_group = parser.add_argument_group('algorithm arguments')
    algo_group.add_argument(
        '-w', '--window',
        type=int,
        default=15,
        help='size of the look-ahead search window (default: 15)'
    )
    algo_group.add_argument(
        '-t', '--threshold',
        type=int,
        default=85,
        help='text similarity threshold (0-100, default: 85)'
    )

    args = parser.parse_args()
    output_format = args.format.lower()

    # Validate the arguments
    if args.mineru_file and args.paddle_file:
        # Single-file mode
        mineru_file = Path(args.mineru_file)
        paddle_file = Path(args.paddle_file)
        output_dir = Path(args.output_dir)
        if not mineru_file.exists():
            print(f"❌ Error: MinerU file does not exist: {mineru_file}")
            return
        if not paddle_file.exists():
            print(f"❌ Error: PaddleOCR file does not exist: {paddle_file}")
            return
        output_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔧 Single-file mode")
        print(f"📄 MinerU file: {mineru_file}")
        print(f"📄 PaddleOCR file: {paddle_file}")
        print(f"📂 Output directory: {output_dir}")
        print(f"⚙️ Look-ahead window: {args.window}")
        print(f"⚙️ Similarity threshold: {args.threshold}%\n")

        merger = MinerUPaddleOCRMerger(
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )
        success = merge_single_file(mineru_file, paddle_file, output_dir, output_format, merger)
        if success:
            print("\n✅ Done!")
        else:
            print("\n❌ Failed!")
    elif args.mineru_dir and args.paddle_dir:
        # Batch mode
        if not Path(args.mineru_dir).exists():
            print(f"❌ Error: MinerU directory does not exist: {args.mineru_dir}")
            return
        if not Path(args.paddle_dir).exists():
            print(f"❌ Error: PaddleOCR directory does not exist: {args.paddle_dir}")
            return
        print("\n🔧 Batch mode")
        merge_mineru_paddle_batch(
            args.mineru_dir,
            args.paddle_dir,
            args.output_dir,
            output_format=output_format,
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )
    else:
        parser.print_help()
        print("\n❌ Error: please supply either the single-file or the batch-mode arguments")
        print("   Single-file mode: --mineru-file and --paddle-file")
        print("   Batch mode: --mineru-dir and --paddle-dir")


if __name__ == "__main__":
    print("🚀 Starting the MinerU + PaddleOCR merger...")
    import sys
    if len(sys.argv) == 1:
        # No command-line arguments: run with the default configuration
        print("ℹ️ No command-line arguments supplied, running with the default configuration...")
        # Default configuration
        default_config = {
            "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
            "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
            "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
            "format": "both",
            "window": "15",
            "threshold": "85"
        }
        print(f"📂 MinerU directory: {default_config['mineru-dir']}")
        print(f"📂 PaddleOCR directory: {default_config['paddle-dir']}")
        print(f"📂 Output directory: {default_config['output-dir']}")
        print(f"⚙️ Look-ahead window: {default_config['window']}")
        print(f"⚙️ Similarity threshold: {default_config['threshold']}%\n")
        # Build the argument list from the default configuration
        sys.argv = [sys.argv[0]]
        for key, value in default_config.items():
            sys.argv.extend([f"--{key}", str(value)])
    sys.exit(main())