paddleocr_vl_merger.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. """
  2. PaddleOCR_VL 和 PaddleOCR 合并模块
  3. """
import json
from typing import Dict, List, Optional

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
    from .data_processor import DataProcessor
    from .markdown_generator import MarkdownGenerator
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor
    from data_processor import DataProcessor
    from markdown_generator import MarkdownGenerator
  16. class PaddleOCRVLMerger:
  17. """PaddleOCR_VL 和 PaddleOCR 结果合并器"""
  18. def __init__(self, look_ahead_window: int = 10, similarity_threshold: int = 90):
  19. """
  20. Args:
  21. look_ahead_window: 向前查找的窗口大小
  22. similarity_threshold: 文本相似度阈值
  23. """
  24. self.look_ahead_window = look_ahead_window
  25. self.similarity_threshold = similarity_threshold
  26. # 初始化子模块
  27. self.text_matcher = TextMatcher(similarity_threshold)
  28. self.bbox_extractor = BBoxExtractor()
  29. self.data_processor = DataProcessor(self.text_matcher, look_ahead_window)
  30. self.markdown_generator = MarkdownGenerator()
  31. def merge_table_with_bbox(self, paddleocr_vl_json_path: str,
  32. paddle_json_path: str) -> List[Dict]:
  33. """
  34. 合并 PaddleOCR_VL 和 PaddleOCR 的结果
  35. Args:
  36. paddleocr_vl_json_path: PaddleOCR_VL 输出的 JSON 路径
  37. paddle_json_path: PaddleOCR 输出的 JSON 路径
  38. Returns:
  39. 合并后的结果 (MinerU 格式)
  40. """
  41. # 加载数据
  42. with open(paddleocr_vl_json_path, 'r', encoding='utf-8') as f:
  43. paddleocr_vl_data = json.load(f)
  44. with open(paddle_json_path, 'r', encoding='utf-8') as f:
  45. paddle_data = json.load(f)
  46. # 提取 PaddleOCR 的文字框信息
  47. paddle_text_boxes = self.bbox_extractor.extract_paddle_text_boxes(paddle_data)
  48. # 处理 PaddleOCR_VL 的数据
  49. merged_data = self.data_processor.process_paddleocr_vl_data(
  50. paddleocr_vl_data, paddle_text_boxes
  51. )
  52. return merged_data
  53. def generate_enhanced_markdown(self, merged_data: List[Dict],
  54. output_path: str = None,
  55. source_file: str = None) -> str:
  56. """生成增强的 Markdown"""
  57. return self.markdown_generator._generate_paddleocr_vl_markdown(
  58. merged_data, output_path, source_file
  59. )
  60. def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
  61. """提取所有表格单元格及其 bbox 信息"""
  62. return self.bbox_extractor.extract_table_cells_with_bbox(merged_data)