unified_output_converter.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. """
  2. 统一输出格式转换器
  3. 将不同OCR工具的结果转换为标准的MinerU格式
  4. """
  5. from typing import List, Dict
  6. from bs4 import BeautifulSoup
  7. class UnifiedOutputConverter:
  8. """统一输出格式转换器"""
  9. def __init__(self):
  10. # self.text_processor = TextProcessor()
  11. pass
  12. def convert_to_mineru_format(self, merged_data: List[Dict],
  13. data_source: str = 'auto') -> List[Dict]:
  14. """
  15. 将合并后的数据转换为MinerU标准格式
  16. Args:
  17. merged_data: 合并后的数据
  18. data_source: 数据来源 ('paddleocr_vl', 'mineru', 'auto')
  19. Returns:
  20. MinerU格式的数据列表
  21. """
  22. # 自动检测数据格式
  23. if data_source == 'auto':
  24. data_source = self._detect_data_source(merged_data)
  25. if data_source == 'paddleocr_vl':
  26. return self._convert_paddleocr_vl_to_mineru(merged_data)
  27. elif data_source == 'mineru':
  28. return merged_data # 已经是MinerU格式
  29. else:
  30. raise ValueError(f"Unsupported data source: {data_source}")
  31. def _detect_data_source(self, merged_data: List[Dict]) -> str:
  32. """检测数据来源"""
  33. if not merged_data:
  34. return 'mineru'
  35. first_item = merged_data[0]
  36. # 检查PaddleOCR_VL特征
  37. if 'block_label' in first_item:
  38. return 'paddleocr_vl'
  39. # 检查MinerU特征
  40. if 'type' in first_item:
  41. return 'mineru'
  42. return 'mineru' # 默认
  43. def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
  44. """将PaddleOCR_VL格式转换为MinerU格式"""
  45. mineru_data = []
  46. for item in merged_data:
  47. block_label = item.get('block_label', '')
  48. # 映射block_label到MinerU的type
  49. type_mapping = {
  50. 'header': 'header',
  51. 'footer': 'footer',
  52. 'page_number': 'page_number',
  53. 'paragraph_title': 'title',
  54. 'doc_title': 'title',
  55. 'abstract_title': 'title',
  56. 'reference_title': 'title',
  57. 'content_title': 'title',
  58. 'figure_title': 'title',
  59. 'table_title': 'title',
  60. 'text': 'text',
  61. 'table': 'table',
  62. 'figure': 'image',
  63. 'chart': 'image',
  64. 'seal': 'image',
  65. 'equation': 'interline_equation',
  66. 'reference': 'ref_text',
  67. }
  68. mineru_type = type_mapping.get(block_label, 'text')
  69. # 构建MinerU格式的数据项
  70. mineru_item = {
  71. 'type': mineru_type,
  72. 'bbox': item.get('block_bbox', item.get('bbox', [])),
  73. 'page_idx': item.get('page_idx', 0)
  74. }
  75. # 根据类型添加特定字段
  76. if mineru_type == 'title':
  77. mineru_item['text'] = item.get('block_content', item.get('text', ''))
  78. # 根据block_label确定标题级别
  79. level_map = {
  80. 'doc_title': 1,
  81. 'paragraph_title': 2,
  82. 'abstract_title': 2,
  83. 'reference_title': 2,
  84. 'content_title': 3,
  85. 'figure_title': 4,
  86. 'table_title': 4,
  87. }
  88. mineru_item['text_level'] = level_map.get(block_label, 1)
  89. elif mineru_type == 'text':
  90. mineru_item['text'] = item.get('block_content', item.get('text', ''))
  91. elif mineru_type in ['header', 'footer', 'page_number', 'ref_text']:
  92. mineru_item['text'] = item.get('block_content', item.get('text', ''))
  93. elif mineru_type == 'table':
  94. # 处理表格
  95. table_html = item.get('block_content_with_bbox',
  96. item.get('block_content', ''))
  97. mineru_item['table_body'] = table_html
  98. mineru_item['table_body_with_bbox'] = table_html
  99. mineru_item['table_caption'] = []
  100. mineru_item['table_footnote'] = []
  101. # 提取表格单元格信息
  102. if item.get('table_cells'):
  103. mineru_item['table_cells'] = item['table_cells']
  104. elif mineru_type == 'image':
  105. mineru_item['img_path'] = item.get('img_path', '')
  106. mineru_item['image_caption'] = []
  107. mineru_item['image_footnote'] = []
  108. elif mineru_type == 'interline_equation':
  109. mineru_item['text'] = item.get('block_content', item.get('text', ''))
  110. mineru_item['text_format'] = 'latex'
  111. mineru_data.append(mineru_item)
  112. return mineru_data