"""
Output formatter - converts processing results into several output formats.

Reuses MinerU's own output helpers so the generated files stay compatible
with MinerU's formats.
"""

import html
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, List, Union

from loguru import logger
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Put the `parents[3]` ancestor of this file on sys.path so that the `mineru`
# package imported below can be resolved.
mineru_path = Path(__file__).parents[3]
if str(mineru_path) not in sys.path:
    sys.path.insert(0, str(mineru_path))

from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.enum_class import MakeMode, BlockType, ContentType


class OutputFormatter:
    """Write processing results to disk in MinerU-style output formats."""

    # Element types that are routed to `discarded_blocks` instead of
    # `para_blocks` when building middle.json.
    _DISCARDED_TYPES = ('header', 'footer', 'page_number', 'aside_text', 'page_footnote')

    # Fonts tried in order for layout labels; Pillow's default font is the
    # final fallback.
    _FONT_CANDIDATES = (
        "/System/Library/Fonts/Helvetica.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )

    def __init__(self, output_dir: str):
        """
        Create the formatter and make sure the output directory exists.

        Args:
            output_dir: Root directory under which one sub-directory per
                document is created.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Block type -> RGB color used when drawing layout images
        # (intended to match MinerU's colors).
        self.color_map = {
            BlockType.TITLE: (102, 102, 255),  # blue
            BlockType.TEXT: (153, 0, 76),  # dark red
            BlockType.IMAGE: (153, 255, 51),  # green
            BlockType.IMAGE_BODY: (153, 255, 51),
            BlockType.IMAGE_CAPTION: (102, 178, 255),
            BlockType.IMAGE_FOOTNOTE: (255, 178, 102),
            BlockType.TABLE: (204, 204, 0),  # yellow
            BlockType.TABLE_BODY: (204, 204, 0),
            BlockType.TABLE_CAPTION: (255, 255, 102),
            BlockType.TABLE_FOOTNOTE: (229, 255, 204),
            BlockType.INTERLINE_EQUATION: (0, 255, 0),  # bright green
            BlockType.LIST: (40, 169, 92),
            BlockType.CODE: (102, 0, 204),  # purple
            BlockType.CODE_BODY: (102, 0, 204),
            BlockType.CODE_CAPTION: (204, 153, 255),
        }

    def save_results(
        self,
        results: Dict[str, Any],
        output_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Save the processing results in every format enabled in the config.

        Args:
            results: Result dictionary. Must contain 'document_path' and
                'pages'; each page may carry a 'processed_image' used for
                layout images.
            output_config: Output switches ('save_json', 'save_content_list',
                'save_markdown', 'save_table_html', 'save_layout_image',
                'draw_type_label', 'draw_bbox_number').

        Returns:
            Mapping from output kind to the written path (as a string);
            'layout_images' maps to a list of paths.
        """
        output_paths = {}

        # Each document gets its own sub-directory named after the file stem.
        doc_name = Path(results['document_path']).stem
        doc_output_dir = self.output_dir / doc_name
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # 1. Convert to the middle.json structure; all other outputs derive from it.
        middle_json = self._convert_to_middle_json(results)

        # 2. middle.json
        if output_config.get('save_json', True):
            middle_json_path = doc_output_dir / f"{doc_name}_middle.json"
            with open(middle_json_path, 'w', encoding='utf-8') as f:
                json.dump(middle_json, f, ensure_ascii=False, indent=2)
            output_paths['middle_json'] = str(middle_json_path)
            logger.info(f"📄 Middle JSON saved: {middle_json_path}")

        # 3. content_list.json via vlm_union_make
        if output_config.get('save_content_list', True):
            content_list_path = self._save_content_list(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['content_list'] = str(content_list_path)

        # 4. Markdown
        if output_config.get('save_markdown', True):
            md_path = self._save_markdown(middle_json, doc_output_dir, doc_name)
            output_paths['markdown'] = str(md_path)

        # 5. One HTML file per table
        if output_config.get('save_table_html', True):
            table_html_dir = self._save_table_htmls(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['table_htmls'] = str(table_html_dir)

        # 6. Layout images (off by default)
        if output_config.get('save_layout_image', False):
            layout_image_paths = self._save_layout_image(
                middle_json=middle_json,
                results=results,
                output_dir=doc_output_dir,
                doc_name=doc_name,
                draw_type_label=output_config.get('draw_type_label', True),
                draw_bbox_number=output_config.get('draw_bbox_number', True)
            )
            output_paths['layout_images'] = layout_image_paths

        logger.info(f"✅ Results saved to: {doc_output_dir}")
        return output_paths

    def _convert_to_middle_json(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the results into a MinerU-style middle.json dictionary.

        Intended to follow the VLM backend format described in
        docs/zh/reference/output_files.md.

        Args:
            results: Result dictionary with a 'pages' list; each page needs
                'page_idx' and 'elements'.

        Returns:
            Dictionary with 'pdf_info' (one entry per page) plus the
            '_backend', '_scene' and '_version_name' metadata keys.
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",  # marks the VLM backend
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for page in results['pages']:
            page_info = {
                'page_idx': page['page_idx'],
                # image_shape's first two entries are reversed to get [width, height]
                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            for element in page['elements']:
                block = self._element_to_mineru_block(element, page_info['page_size'])
                if not block:
                    continue
                # Page furniture goes to discarded_blocks, everything else
                # to para_blocks.
                if element.get('type') in self._DISCARDED_TYPES:
                    page_info['discarded_blocks'].append(block)
                else:
                    page_info['para_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _content_dict(element: Dict[str, Any]) -> Dict[str, Any]:
        """Return the element's 'content' if it is a dict, else an empty dict."""
        content = element.get('content')
        return content if isinstance(content, dict) else {}

    @staticmethod
    def _as_text_list(value: Any) -> List[Any]:
        """Normalize a caption/footnote/list field to a list (None -> [], str -> [str])."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _make_line(bbox: List[Any], span: Dict[str, Any]) -> Dict[str, Any]:
        """Build a line holding a single span; both reuse the given bbox."""
        return {'bbox': bbox, 'spans': [{'bbox': bbox, **span}]}

    @classmethod
    def _make_sub_block(
        cls,
        block_type: str,
        bbox: List[Any],
        span: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Build a child block (angle 0) containing one single-span line."""
        return {
            'type': block_type,
            'bbox': bbox,
            'angle': 0,
            'lines': [cls._make_line(bbox, span)]
        }

    def _element_to_mineru_block(
        self,
        element: Dict[str, Any],
        page_size: List[int]
    ) -> Dict[str, Any]:
        """
        Convert one result element into a MinerU-style block dictionary.

        Reference: mineru/backend/vlm/vlm_middle_json_mkcontent.py

        Args:
            element: Element with 'type', 'bbox' and a type-specific 'content'.
            page_size: [width, height] of the page. Currently unused: the
                bbox is passed through unchanged rather than normalized.

        Returns:
            The block dictionary. Child blocks (captions, bodies, list items)
            reuse the parent's bbox because no finer geometry is available.
        """
        element_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])

        block = {
            'type': element_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),  # specific to the VLM backend
            'lines': []
        }

        # Plain text-like types (text, title, ref_text, ...)
        if element_type in [BlockType.TEXT, BlockType.TITLE, BlockType.REF_TEXT,
                            BlockType.PHONETIC, BlockType.HEADER, BlockType.FOOTER,
                            BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT,
                            BlockType.PAGE_FOOTNOTE]:
            raw_content = element.get('content', {})
            if isinstance(raw_content, dict):
                text = raw_content.get('text', '')
            elif raw_content is None:
                # Avoid emitting the literal string "None" for missing content.
                text = ''
            else:
                text = str(raw_content)

            if text:
                block['lines'] = [
                    self._make_line(bbox, {'type': ContentType.TEXT, 'content': text})
                ]

            # Title level
            if element_type == BlockType.TITLE and 'level' in element:
                block['level'] = element['level']

        # List
        elif element_type == BlockType.LIST:
            block['sub_type'] = element.get('sub_type', 'text')
            list_items = self._as_text_list(
                self._content_dict(element).get('list_items', [])
            )
            block['blocks'] = [
                self._make_sub_block(
                    BlockType.TEXT, bbox,
                    {'type': ContentType.TEXT, 'content': item_text}
                )
                for item_text in list_items
            ]

        # Code block
        elif element_type == BlockType.CODE:
            block['sub_type'] = element.get('sub_type', 'code')
            block['blocks'] = []
            code_content = self._content_dict(element)

            code_body = code_content.get('code_body', '')
            if code_body:
                block['blocks'].append(self._make_sub_block(
                    BlockType.CODE_BODY, bbox,
                    {'type': ContentType.TEXT, 'content': code_body}
                ))

            # Language hint
            if 'guess_lang' in element:
                block['guess_lang'] = element['guess_lang']

            for caption_text in self._as_text_list(code_content.get('code_caption', [])):
                block['blocks'].append(self._make_sub_block(
                    BlockType.CODE_CAPTION, bbox,
                    {'type': ContentType.TEXT, 'content': caption_text}
                ))

        # Interline equation
        elif element_type == BlockType.INTERLINE_EQUATION:
            latex = self._content_dict(element).get('latex', '')
            block['lines'] = [
                self._make_line(
                    bbox, {'type': ContentType.INTERLINE_EQUATION, 'content': latex}
                )
            ]

        # Image
        elif element_type == BlockType.IMAGE:
            block['blocks'] = []
            image_content = self._content_dict(element)

            img_path = image_content.get('img_path', '')
            if img_path:
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_BODY, bbox,
                    {'type': ContentType.IMAGE, 'image_path': img_path}
                ))

            for caption_text in self._as_text_list(image_content.get('image_caption', [])):
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_CAPTION, bbox,
                    {'type': ContentType.TEXT, 'content': caption_text}
                ))

            for footnote_text in self._as_text_list(image_content.get('image_footnote', [])):
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_FOOTNOTE, bbox,
                    {'type': ContentType.TEXT, 'content': footnote_text}
                ))

        # Table
        elif element_type == BlockType.TABLE:
            block['blocks'] = []
            table_content = self._content_dict(element)

            table_html = table_content.get('html', '')
            img_path = table_content.get('img_path', '')
            if table_html or img_path:
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_BODY, bbox,
                    {'type': ContentType.TABLE, 'html': table_html, 'image_path': img_path}
                ))

            for caption_text in self._as_text_list(table_content.get('table_caption', [])):
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_CAPTION, bbox,
                    {'type': ContentType.TEXT, 'content': caption_text}
                ))

            for footnote_text in self._as_text_list(table_content.get('table_footnote', [])):
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_FOOTNOTE, bbox,
                    {'type': ContentType.TEXT, 'content': footnote_text}
                ))

        return block

    def _normalize_bbox(self, bbox: List[float], page_size: List[int]) -> List[float]:
        """
        Normalize a bbox to the 0-1 range.

        Args:
            bbox: [x0, y0, x1, y1] in absolute coordinates.
            page_size: [width, height] of the page.

        Returns:
            The normalized bbox. A missing or malformed bbox yields all
            zeros, and a non-positive page dimension yields 0.0 on that axis.
        """
        if not bbox or len(bbox) != 4:
            return [0.0, 0.0, 0.0, 0.0]

        page_width, page_height = page_size
        x0, y0, x1, y1 = bbox

        return [
            x0 / page_width if page_width > 0 else 0.0,
            y0 / page_height if page_height > 0 else 0.0,
            x1 / page_width if page_width > 0 else 0.0,
            y1 / page_height if page_height > 0 else 0.0
        ]

    def _save_content_list(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """
        Generate content_list.json with vlm_union_make.

        Best-effort: if generation fails, the error is logged and an empty
        JSON list is written instead.

        Returns:
            Path of the written content list file.
        """
        content_list_path = output_dir / f"{doc_name}_content_list.json"

        try:
            content_list = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.CONTENT_LIST,
                img_buket_path='images'
            )

            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump(content_list, f, ensure_ascii=False, indent=2)

            logger.info(f"📋 Content list saved: {content_list_path}")

        except Exception as e:
            logger.error(f"❌ Failed to generate content_list: {e}")
            # Fallback: write an empty list so the file always exists.
            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump([], f)

        return content_list_path

    def _save_markdown(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """
        Generate the Markdown file with vlm_union_make.

        A small metadata header (scene, backend, version) is written before
        the generated content. Best-effort: on failure the error is logged
        and a placeholder document is written instead.

        Returns:
            Path of the written Markdown file.
        """
        md_path = output_dir / f"{doc_name}.md"

        try:
            # Directory matching the 'images' bucket path passed below.
            images_dir = output_dir / 'images'
            images_dir.mkdir(exist_ok=True)

            markdown_content = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.MM_MD,
                img_buket_path='images'
            )

            # Metadata header
            metadata = f"""---
scene: {middle_json.get('_scene', 'unknown')}
backend: {middle_json.get('_backend', 'vlm')}
version: {middle_json.get('_version_name', '2.5.0')}
---

"""

            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(metadata)
                f.write(markdown_content)

            logger.info(f"📝 Markdown saved: {md_path}")

        except Exception as e:
            logger.error(f"❌ Failed to generate markdown: {e}")
            # Fallback
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(f"# {doc_name}\n\n*Markdown generation failed*\n")

        return md_path

    def _save_table_htmls(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """
        Save every table as its own HTML file under `<output_dir>/tables`.

        Only table blocks in 'para_blocks' are considered, and only
        table-body spans with non-empty 'html' produce a file.

        Returns:
            The tables directory (created even if no table is found).
        """
        tables_dir = output_dir / 'tables'
        tables_dir.mkdir(exist_ok=True)

        table_count = 0

        # page_idx is the position in pdf_info; it is used in the file name
        # and shown 1-based inside the HTML.
        for page_idx, page_info in enumerate(middle_json['pdf_info']):
            for block in page_info.get('para_blocks', []):
                if block.get('type') != BlockType.TABLE:
                    continue
                for sub_block in block.get('blocks', []):
                    if sub_block.get('type') != BlockType.TABLE_BODY:
                        continue
                    for line in sub_block.get('lines', []):
                        for span in line.get('spans', []):
                            html_content = span.get('html', '')
                            if not html_content:
                                continue

                            table_count += 1
                            table_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx}.html"

                            full_html = self._wrap_table_html(
                                html_content,
                                f"{doc_name} - Table {table_count}",
                                page_idx
                            )

                            with open(table_path, 'w', encoding='utf-8') as f:
                                f.write(full_html)

                            logger.info(f"📊 Table {table_count} saved: {table_path}")

        if table_count > 0:
            logger.info(f"📊 Total {table_count} tables saved to: {tables_dir}")

        return tables_dir

    def _wrap_table_html(self, table_html: str, title: str, page_idx: int) -> str:
        """
        Wrap table HTML in a complete, standalone HTML document.

        Args:
            table_html: Table markup, inserted verbatim.
            title: Document title; HTML-escaped because it is derived from
                the document file name.
            page_idx: 0-based page index; displayed 1-based.

        Returns:
            The full HTML document as a string.
        """
        safe_title = html.escape(title)
        # The charset declaration matches the UTF-8 encoding used when the
        # file is written.
        return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
</head>
<body>
<h2>Title: {safe_title}</h2>
<p>Page: {page_idx + 1}</p>
{table_html}
</body>
</html>
"""

    def _load_label_font(self, size: int = 14) -> Any:
        """Return the first loadable candidate font, or Pillow's default font."""
        for font_path in self._FONT_CANDIDATES:
            try:
                return ImageFont.truetype(font_path, size)
            except (OSError, ImportError):
                # Font file missing/unreadable or FreeType support unavailable.
                continue
        return ImageFont.load_default()

    @staticmethod
    def _open_as_rgb(source: Any) -> Any:
        """Return an RGB PIL image for a path, ndarray or PIL image; None otherwise."""
        if isinstance(source, str):
            return Image.open(source).convert('RGB')
        if isinstance(source, np.ndarray):
            return Image.fromarray(source).convert('RGB')
        if isinstance(source, Image.Image):
            return source.convert('RGB')
        return None

    def _save_layout_image(
        self,
        middle_json: Dict[str, Any],
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True
    ) -> List[Path]:
        """
        Draw the detected layout on top of each page image.

        Args:
            middle_json: Middle JSON dictionary (its 'pdf_info' is used).
            results: Processing results; each page's 'processed_image' may be
                a file path, a numpy array or a PIL image.
            output_dir: Directory the PNG files are written to.
            doc_name: Document name used as the file name prefix.
            draw_type_label: Whether to draw the block type on each box.
            draw_bbox_number: Whether to draw the running block number.

        Returns:
            Paths of the layout images that were written. Pages without a
            usable image are skipped, so the list may be shorter than the
            page count (and is empty when there are no pages).
        """
        layout_image_paths: List[Path] = []

        pages = results.get('pages', [])
        pdf_info = middle_json.get('pdf_info', [])

        if not pages:
            logger.warning("⚠️ No pages found in results")
            return layout_image_paths

        logger.info(f"🖼️ Generating layout images for {len(pages)} page(s)...")

        # The font does not depend on the page, so load it once.
        font = self._load_label_font()

        for page_idx, (page, page_info) in enumerate(zip(pages, pdf_info)):
            source_image = page.get('processed_image')
            if source_image is None:
                logger.warning(f"⚠️ No processed_image found for page {page_idx}, skipping layout image.")
                continue

            layout_image_path = output_dir / f"{doc_name}_{page_idx + 1}_layout.png"

            image = self._open_as_rgb(source_image)
            if image is None:
                # Skip only this page; the remaining pages are still rendered.
                logger.error("Invalid image type")
                continue

            blocks = page_info.get('para_blocks', []) + page_info.get('discarded_blocks', [])
            for block_idx, block in enumerate(blocks, start=1):
                block_type = block.get('type', '')
                raw_bbox = block.get('bbox', [0, 0, 0, 0])
                if not raw_bbox or len(raw_bbox) != 4:
                    # A malformed bbox cannot be drawn.
                    continue

                # Order the corners so Pillow accepts the rectangle.
                x0, x1 = sorted((int(raw_bbox[0]), int(raw_bbox[2])))
                y0, y1 = sorted((int(raw_bbox[1]), int(raw_bbox[3])))

                # Unknown block types are drawn in red.
                color = self.color_map.get(block_type, (255, 0, 0))

                # Semi-transparent fill: draw on a transparent overlay and
                # composite it onto the page.
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                ImageDraw.Draw(overlay).rectangle(
                    [x0, y0, x1, y1],
                    fill=(*color, 76),  # about 30% opacity
                    outline=color,
                    width=2
                )
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)

                # Border
                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)

                # Type label in the top-left corner
                if draw_type_label:
                    label = block_type.replace('_', ' ').title()
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=color)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)

                # Running number near the top-right corner
                if draw_bbox_number:
                    number_text = str(block_idx)
                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
                    draw.rectangle(bbox_number, fill=(255, 0, 0))
                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)

            image.save(layout_image_path)
            logger.info(f"🖼️ Layout image saved: {layout_image_path}")
            layout_image_paths.append(layout_image_path)

        return layout_image_paths


if __name__ == "__main__":
    # Manual smoke test with a tiny hand-written result.
    sample_results = {
        "document_path": "/path/to/sample.pdf",
        "scene": "financial_report",
        "pages": [
            {
                "page_idx": 0,
                "image_shape": [1654, 2338, 3],
                "elements": [
                    {
                        "type": "title",
                        "bbox": [100, 50, 800, 100],
                        "content": {"text": "财务报告"},
                        "confidence": 0.98,
                        "level": 1
                    },
                    {
                        "type": "table",
                        "bbox": [100, 200, 800, 600],
                        "content": {
                            "html": "<table><tr><th>项目</th><th>金额</th></tr></table>",
                            "markdown": "| 项目 | 金额 |\n|------|------|",
                            "table_caption": ["表1: 财务数据"],
                            "table_footnote": []
                        },
                        "confidence": 0.95
                    }
                ]
            }
        ]
    }

    formatter = OutputFormatter("./test_output")
    output_files = formatter.save_results(
        sample_results,
        {
            "save_json": True,
            "save_content_list": True,
            "save_markdown": True,
            "save_table_html": True,
            "save_layout_image": False
        }
    )

    print("Generated files:", output_files)