|
|
@@ -0,0 +1,770 @@
|
|
|
+"""
|
|
|
+输出格式化器 - 将处理结果转换为多种格式输出
|
|
|
+严格复用MinerU的输出格式,确保完全兼容
|
|
|
+"""
|
|
|
+import json
|
|
|
+import os
|
|
|
+import sys
|
|
|
+from pathlib import Path
|
|
|
+from typing import Dict, Any, List, Union
|
|
|
+from loguru import logger
|
|
|
+import numpy as np
|
|
|
+from PIL import Image, ImageDraw, ImageFont
|
|
|
+
|
|
|
+# 导入MinerU的中间格式转换模块
|
|
|
+mineru_path = Path(__file__).parents[3]
|
|
|
+if str(mineru_path) not in sys.path:
|
|
|
+ sys.path.insert(0, str(mineru_path))
|
|
|
+
|
|
|
+from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
|
+from mineru.utils.enum_class import MakeMode, BlockType, ContentType
|
|
|
+
|
|
|
+
|
|
|
class OutputFormatter:
    """Persist document-processing results as MinerU-style output files.

    A results dictionary is first converted into a ``middle.json``-shaped
    structure. The other artifacts are derived from that structure: a content
    list and a Markdown file (both produced by ``vlm_union_make``), one HTML
    file per table, and optional per-page layout images.
    """

    # Element types routed to ``discarded_blocks`` instead of ``para_blocks``.
    _DISCARDED_TYPES = ('header', 'footer', 'page_number',
                        'aside_text', 'page_footnote')

    # TrueType fonts tried in order for layout labels; the Pillow built-in
    # default font is used when none of them can be loaded.
    _FONT_CANDIDATES = (
        "/System/Library/Fonts/Helvetica.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    _FONT_SIZE = 14

    # Alpha (0-255) of the translucent block fill; 76 is roughly 30 %.
    _FILL_ALPHA = 76

    def __init__(self, output_dir: str):
        """Create the output root directory and the block-colour table.

        Args:
            output_dir: Root directory for all outputs; it is created
                (including parents) when it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # RGB colour per block type, used when drawing layout images.
        # The original author notes the palette is meant to mirror MinerU's.
        self.color_map = {
            BlockType.TITLE: (102, 102, 255),            # blue
            BlockType.TEXT: (153, 0, 76),                # dark red
            BlockType.IMAGE: (153, 255, 51),             # green
            BlockType.IMAGE_BODY: (153, 255, 51),
            BlockType.IMAGE_CAPTION: (102, 178, 255),
            BlockType.IMAGE_FOOTNOTE: (255, 178, 102),
            BlockType.TABLE: (204, 204, 0),              # yellow
            BlockType.TABLE_BODY: (204, 204, 0),
            BlockType.TABLE_CAPTION: (255, 255, 102),
            BlockType.TABLE_FOOTNOTE: (229, 255, 204),
            BlockType.INTERLINE_EQUATION: (0, 255, 0),   # bright green
            BlockType.LIST: (40, 169, 92),
            BlockType.CODE: (102, 0, 204),               # purple
            BlockType.CODE_BODY: (102, 0, 204),
            BlockType.CODE_CAPTION: (204, 153, 255),
        }

    def save_results(
        self,
        results: Dict[str, Any],
        output_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Write the requested output files for one document.

        Args:
            results: Processing results. ``document_path`` is required (its
                stem names the output sub-directory and files); ``pages`` is
                the per-page list consumed by the converters below.
            output_config: Flags selecting the outputs: ``save_json``,
                ``save_content_list``, ``save_markdown``, ``save_table_html``
                (all default ``True``) and ``save_layout_image`` (default
                ``False``), plus ``draw_type_label`` / ``draw_bbox_number``
                for the layout images.

        Returns:
            Mapping from output kind to its location. All values are path
            strings except ``layout_images``, which is a list of ``Path``.

        Raises:
            KeyError: If ``results`` has no ``document_path``.
        """
        output_paths = {}

        # Every document gets its own sub-directory named after the file stem.
        doc_name = Path(results['document_path']).stem
        doc_output_dir = self.output_dir / doc_name
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # The middle JSON is the single source for all derived outputs, so it
        # is built even when it is not itself written to disk.
        middle_json = self._convert_to_middle_json(results)

        if output_config.get('save_json', True):
            middle_json_path = doc_output_dir / f"{doc_name}_middle.json"
            with open(middle_json_path, 'w', encoding='utf-8') as f:
                json.dump(middle_json, f, ensure_ascii=False, indent=2)
            output_paths['middle_json'] = str(middle_json_path)
            logger.info(f"📄 Middle JSON saved: {middle_json_path}")

        if output_config.get('save_content_list', True):
            content_list_path = self._save_content_list(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['content_list'] = str(content_list_path)

        if output_config.get('save_markdown', True):
            md_path = self._save_markdown(middle_json, doc_output_dir, doc_name)
            output_paths['markdown'] = str(md_path)

        if output_config.get('save_table_html', True):
            table_html_dir = self._save_table_htmls(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['table_htmls'] = str(table_html_dir)

        if output_config.get('save_layout_image', False):
            output_paths['layout_images'] = self._save_layout_image(
                middle_json=middle_json,
                results=results,
                output_dir=doc_output_dir,
                doc_name=doc_name,
                draw_type_label=output_config.get('draw_type_label', True),
                draw_bbox_number=output_config.get('draw_bbox_number', True)
            )

        logger.info(f"✅ Results saved to: {doc_output_dir}")
        return output_paths

    def _convert_to_middle_json(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Build the ``middle.json`` structure from the processing results.

        Args:
            results: Processing results; ``pages`` is a list of page dicts
                carrying ``page_idx``, optional ``image_shape`` / ``angle``
                and an ``elements`` list.

        Returns:
            A dict with a ``pdf_info`` list (one entry per page) and the
            ``_backend`` / ``_scene`` / ``_version_name`` metadata keys.
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for page in results.get('pages', []):
            # image_shape starts with two dimensions that are reversed here to
            # give the [width, height] order stored in page_size.
            page_size = list(page.get('image_shape', [0, 0])[:2][::-1])
            page_info = {
                'page_idx': page['page_idx'],
                'page_size': page_size,
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            for element in page.get('elements', []):
                block = self._element_to_mineru_block(element, page_size)
                if element.get('type') in self._DISCARDED_TYPES:
                    page_info['discarded_blocks'].append(block)
                else:
                    page_info['para_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return ``value`` as a list: ``None`` -> ``[]``, a string -> ``[value]``."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _single_span_lines(bbox: Any, span_type: Any, **payload: Any) -> List[Dict[str, Any]]:
        """Return a one-line, one-span ``lines`` list sharing ``bbox``."""
        return [{
            'bbox': bbox,
            'spans': [{'bbox': bbox, 'type': span_type, **payload}]
        }]

    def _leaf_block(self, block_type: Any, bbox: Any, span_type: Any,
                    **payload: Any) -> Dict[str, Any]:
        """Return a child block (angle 0) that holds a single span."""
        return {
            'type': block_type,
            'bbox': bbox,
            'angle': 0,
            'lines': self._single_span_lines(bbox, span_type, **payload)
        }

    def _element_to_mineru_block(
        self,
        element: Dict[str, Any],
        page_size: List[int]
    ) -> Dict[str, Any]:
        """Convert one result element into a MinerU-style block dict.

        Child blocks, lines and spans all reuse the element's own ``bbox``
        because the input carries no finer-grained geometry.

        Args:
            element: Element dict with ``type``, ``bbox`` and ``content``;
                optional ``angle``, ``level``, ``sub_type`` and ``guess_lang``.
                ``content`` is normally a dict. For text-like types any other
                non-``None`` value is stringified; for every other type a
                non-dict ``content`` is treated as empty.
            page_size: ``[width, height]`` of the page. Currently unused; kept
                for interface stability.

        Returns:
            The block dict. Unrecognised element types yield a block with
            empty ``lines``.
        """
        element_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])
        raw_content = element.get('content')
        content = raw_content if isinstance(raw_content, dict) else {}

        block = {
            'type': element_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),
            'lines': []
        }

        text_types = (
            BlockType.TEXT, BlockType.TITLE, BlockType.REF_TEXT,
            BlockType.PHONETIC, BlockType.HEADER, BlockType.FOOTER,
            BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE,
        )

        if element_type in text_types:
            if isinstance(raw_content, dict):
                text = raw_content.get('text', '')
            elif raw_content is None:
                # An explicit None must not turn into the literal text "None".
                text = ''
            else:
                text = str(raw_content)

            if text:
                block['lines'] = self._single_span_lines(
                    bbox, ContentType.TEXT, content=text
                )

            if element_type == BlockType.TITLE and 'level' in element:
                block['level'] = element['level']

        elif element_type == BlockType.LIST:
            block['sub_type'] = element.get('sub_type', 'text')
            block['blocks'] = [
                self._leaf_block(BlockType.TEXT, bbox, ContentType.TEXT,
                                 content=item_text)
                for item_text in self._as_list(content.get('list_items'))
            ]

        elif element_type == BlockType.CODE:
            block['sub_type'] = element.get('sub_type', 'code')
            block['blocks'] = []

            code_body = content.get('code_body', '')
            if code_body:
                block['blocks'].append(self._leaf_block(
                    BlockType.CODE_BODY, bbox, ContentType.TEXT,
                    content=code_body
                ))

            # The language hint is attached whenever the element carries one.
            if 'guess_lang' in element:
                block['guess_lang'] = element['guess_lang']

            for caption_text in self._as_list(content.get('code_caption')):
                block['blocks'].append(self._leaf_block(
                    BlockType.CODE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

        elif element_type == BlockType.INTERLINE_EQUATION:
            # The span is emitted even when the LaTeX string is empty.
            block['lines'] = self._single_span_lines(
                bbox, ContentType.INTERLINE_EQUATION,
                content=content.get('latex', '')
            )

        elif element_type == BlockType.IMAGE:
            block['blocks'] = []

            img_path = content.get('img_path', '')
            if img_path:
                block['blocks'].append(self._leaf_block(
                    BlockType.IMAGE_BODY, bbox, ContentType.IMAGE,
                    image_path=img_path
                ))

            for caption_text in self._as_list(content.get('image_caption')):
                block['blocks'].append(self._leaf_block(
                    BlockType.IMAGE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

            for footnote_text in self._as_list(content.get('image_footnote')):
                block['blocks'].append(self._leaf_block(
                    BlockType.IMAGE_FOOTNOTE, bbox, ContentType.TEXT,
                    content=footnote_text
                ))

        elif element_type == BlockType.TABLE:
            block['blocks'] = []

            table_html = content.get('html', '')
            img_path = content.get('img_path', '')
            if table_html or img_path:
                block['blocks'].append(self._leaf_block(
                    BlockType.TABLE_BODY, bbox, ContentType.TABLE,
                    html=table_html, image_path=img_path
                ))

            for caption_text in self._as_list(content.get('table_caption')):
                block['blocks'].append(self._leaf_block(
                    BlockType.TABLE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

            for footnote_text in self._as_list(content.get('table_footnote')):
                block['blocks'].append(self._leaf_block(
                    BlockType.TABLE_FOOTNOTE, bbox, ContentType.TEXT,
                    content=footnote_text
                ))

        return block

    def _normalize_bbox(self, bbox: List[float], page_size: List[int]) -> List[float]:
        """Scale an absolute bbox by the page size.

        Args:
            bbox: ``[x0, y0, x1, y1]`` in absolute coordinates.
            page_size: ``[width, height]`` of the page.

        Returns:
            The bbox divided component-wise by width/height. A missing bbox,
            or one that does not have exactly four values, yields all zeros;
            a non-positive page dimension zeroes the affected components.
        """
        if not bbox or len(bbox) != 4:
            return [0.0, 0.0, 0.0, 0.0]

        page_width, page_height = page_size
        x0, y0, x1, y1 = bbox

        return [
            x0 / page_width if page_width > 0 else 0.0,
            y0 / page_height if page_height > 0 else 0.0,
            x1 / page_width if page_width > 0 else 0.0,
            y1 / page_height if page_height > 0 else 0.0
        ]

    def _save_content_list(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Write ``<doc_name>_content_list.json`` generated by ``vlm_union_make``.

        Generation is best effort: on any failure the error is logged and an
        empty JSON list is written instead, so the returned path always exists.

        Args:
            middle_json: Structure produced by ``_convert_to_middle_json``.
            output_dir: Directory receiving the file.
            doc_name: Document name used as the file-name prefix.

        Returns:
            Path of the written content-list file.
        """
        content_list_path = output_dir / f"{doc_name}_content_list.json"

        try:
            content_list = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.CONTENT_LIST,
                img_buket_path='images'
            )

            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump(content_list, f, ensure_ascii=False, indent=2)

            logger.info(f"📋 Content list saved: {content_list_path}")

        except Exception as e:
            logger.error(f"❌ Failed to generate content_list: {e}")
            # Fallback keeps downstream consumers working with an empty list.
            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump([], f)

        return content_list_path

    def _save_markdown(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Write ``<doc_name>.md`` generated by ``vlm_union_make``.

        The Markdown body is preceded by a front-matter header carrying the
        scene, backend and version recorded in ``middle_json``. Generation is
        best effort: on any failure a short placeholder document is written.

        Args:
            middle_json: Structure produced by ``_convert_to_middle_json``.
            output_dir: Directory receiving the file; an ``images``
                sub-directory is created next to it.
            doc_name: Document name used as the file name.

        Returns:
            Path of the written Markdown file.
        """
        md_path = output_dir / f"{doc_name}.md"

        try:
            # Matches the image bucket path handed to vlm_union_make below.
            images_dir = output_dir / 'images'
            images_dir.mkdir(exist_ok=True)

            markdown_content = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.MM_MD,
                img_buket_path='images'
            )

            front_matter = "\n".join([
                "---",
                f"scene: {middle_json.get('_scene', 'unknown')}",
                f"backend: {middle_json.get('_backend', 'vlm')}",
                f"version: {middle_json.get('_version_name', '2.5.0')}",
                "---",
                "",
                "",
            ])

            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(front_matter)
                f.write(markdown_content)

            logger.info(f"📝 Markdown saved: {md_path}")

        except Exception as e:
            logger.error(f"❌ Failed to generate markdown: {e}")
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(f"# {doc_name}\n\n*Markdown generation failed*\n")

        return md_path

    @staticmethod
    def _iter_table_html(page_info: Dict[str, Any]):
        """Yield every non-empty table HTML string found on one page."""
        for block in page_info.get('para_blocks', []):
            if block.get('type') != BlockType.TABLE:
                continue
            for sub_block in block.get('blocks', []):
                if sub_block.get('type') != BlockType.TABLE_BODY:
                    continue
                for line in sub_block.get('lines', []):
                    for span in line.get('spans', []):
                        html_content = span.get('html', '')
                        if html_content:
                            yield html_content

    def _save_table_htmls(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Save every table as a standalone HTML file under ``tables/``.

        Files are named ``<doc_name>_table_<n>_page_<i>.html`` where ``n``
        counts tables across the document starting at 1 and ``i`` is the
        zero-based position of the page in ``pdf_info``.

        Args:
            middle_json: Structure produced by ``_convert_to_middle_json``.
            output_dir: Document output directory.
            doc_name: Document name used as the file-name prefix.

        Returns:
            The ``tables`` directory (created even when no table exists).
        """
        tables_dir = output_dir / 'tables'
        tables_dir.mkdir(exist_ok=True)

        table_count = 0
        for page_idx, page_info in enumerate(middle_json['pdf_info']):
            for html_content in self._iter_table_html(page_info):
                table_count += 1
                table_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx}.html"

                full_html = self._wrap_table_html(
                    html_content,
                    f"{doc_name} - Table {table_count}",
                    page_idx
                )
                with open(table_path, 'w', encoding='utf-8') as f:
                    f.write(full_html)

                logger.info(f"📊 Table {table_count} saved: {table_path}")

        if table_count > 0:
            logger.info(f"📊 Total {table_count} tables saved to: {tables_dir}")

        return tables_dir

    def _wrap_table_html(self, table_html: str, title: str, page_idx: int) -> str:
        """Wrap a table fragment in a complete, styled HTML document.

        Args:
            table_html: Table markup, inserted verbatim.
            title: Plain-text title; it is HTML-escaped because it is derived
                from the document file name.
            page_idx: Zero-based page index; displayed one-based.

        Returns:
            The full HTML document as a string.
        """
        from html import escape

        safe_title = escape(title)
        return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        body {{
            font-family: Arial, "Microsoft YaHei", sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }}
        .meta {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #ddd;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        tr:hover {{
            background-color: #f9f9f9;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="meta">
            <p><strong>Title:</strong> {safe_title}</p>
            <p><strong>Page:</strong> {page_idx + 1}</p>
        </div>
        {table_html}
    </div>
</body>
</html>"""

    def _load_label_font(self):
        """Return the first loadable candidate font, else Pillow's default."""
        for font_path in self._FONT_CANDIDATES:
            try:
                return ImageFont.truetype(font_path, self._FONT_SIZE)
            except OSError:
                # Font file missing or unreadable on this platform.
                continue
        return ImageFont.load_default()

    @staticmethod
    def _open_page_image(source: Any):
        """Return ``source`` as an RGB PIL image, or ``None`` if unsupported.

        Accepts a file path (``str`` or ``Path``), a NumPy array, or a PIL
        image.
        """
        if isinstance(source, (str, Path)):
            # The context manager releases the file handle once pixels are copied.
            with Image.open(source) as opened:
                return opened.convert('RGB')
        if isinstance(source, np.ndarray):
            return Image.fromarray(source).convert('RGB')
        if isinstance(source, Image.Image):
            return source.convert('RGB')
        return None

    def _draw_block(self, image, block: Dict[str, Any], number: int, font,
                    draw_type_label: bool, draw_bbox_number: bool) -> None:
        """Draw one block (fill, border, optional labels) onto ``image`` in place."""
        bbox = block.get('bbox')
        if bbox is None or len(bbox) != 4:
            logger.warning(f"⚠️ Block {number} has an invalid bbox, skipping: {bbox}")
            return

        # Order the corners so an inverted box cannot make Pillow raise.
        x0, x1 = sorted((int(bbox[0]), int(bbox[2])))
        y0, y1 = sorted((int(bbox[1]), int(bbox[3])))

        block_type = block.get('type', '')
        color = self.color_map.get(block_type, (255, 0, 0))

        # The translucent fill is drawn on a transparent overlay and then
        # alpha-composited, so the page content stays visible underneath.
        overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
        ImageDraw.Draw(overlay).rectangle(
            [x0, y0, x1, y1],
            fill=(*color, self._FILL_ALPHA),
            outline=color,
            width=2
        )
        blended = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
        image.paste(blended)

        draw = ImageDraw.Draw(image)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=2)

        if draw_type_label:
            label = str(block_type).replace('_', ' ').title()
            label_box = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
            draw.rectangle(label_box, fill=color)
            draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)

        if draw_bbox_number:
            number_text = str(number)
            number_box = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
            draw.rectangle(number_box, fill=(255, 0, 0))
            draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)

    def _save_layout_image(
        self,
        middle_json: Dict[str, Any],
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True
    ) -> List[Path]:
        """Draw the detected layout on each page image and save it as PNG.

        Pages are paired positionally with ``middle_json['pdf_info']``. A page
        whose ``processed_image`` is missing or of an unsupported type is
        logged and skipped; the remaining pages are still rendered.

        Args:
            middle_json: Structure produced by ``_convert_to_middle_json``.
            results: Processing results; each page's ``processed_image`` may
                be a file path, a NumPy array or a PIL image.
            output_dir: Directory receiving the PNG files.
            doc_name: Document name used as the file-name prefix.
            draw_type_label: Whether to label each box with its block type.
            draw_bbox_number: Whether to label each box with its index, which
                starts at 1 and counts ``para_blocks`` before
                ``discarded_blocks``.

        Returns:
            Paths of the images actually written (empty when there are no
            pages or none could be rendered).
        """
        layout_image_paths = []

        pages = results.get('pages', [])
        pdf_info = middle_json.get('pdf_info', [])

        if not pages:
            logger.warning("⚠️ No pages found in results")
            return layout_image_paths

        logger.info(f"🖼️ Generating layout images for {len(pages)} page(s)...")

        # The font does not depend on the page, so it is loaded once.
        font = self._load_label_font()

        for page_idx, (page, page_info) in enumerate(zip(pages, pdf_info)):
            original_image = page.get('processed_image')
            if original_image is None:
                logger.warning(f"⚠️ No processed_image found for page {page_idx}, skipping layout image.")
                continue

            image = self._open_page_image(original_image)
            if image is None:
                logger.error("Invalid image type")
                continue

            blocks = page_info.get('para_blocks', []) + page_info.get('discarded_blocks', [])
            for block_idx, block in enumerate(blocks, start=1):
                self._draw_block(image, block, block_idx, font,
                                 draw_type_label, draw_bbox_number)

            layout_image_path = output_dir / f"{doc_name}_{page_idx + 1}_layout.png"
            image.save(layout_image_path)
            logger.info(f"🖼️ Layout image saved: {layout_image_path}")

            layout_image_paths.append(layout_image_path)

        return layout_image_paths
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Manual smoke test: format a one-page sample document (a title and a
    # table) into ./test_output and print the paths of the generated files.
    title_element = {
        "type": "title",
        "bbox": [100, 50, 800, 100],
        "content": {"text": "财务报告"},
        "confidence": 0.98,
        "level": 1,
    }
    table_element = {
        "type": "table",
        "bbox": [100, 200, 800, 600],
        "content": {
            "html": "<table><tr><td>项目</td><td>金额</td></tr></table>",
            "markdown": "| 项目 | 金额 |\n|------|------|",
            "table_caption": ["表1: 财务数据"],
            "table_footnote": [],
        },
        "confidence": 0.95,
    }
    first_page = {
        "page_idx": 0,
        "image_shape": [1654, 2338, 3],
        "elements": [title_element, table_element],
    }
    sample_results = {
        "document_path": "/path/to/sample.pdf",
        "scene": "financial_report",
        "pages": [first_page],
    }

    # Everything except the layout image is enabled; the sample pages carry
    # no processed_image to draw on.
    output_options = dict(
        save_json=True,
        save_content_list=True,
        save_markdown=True,
        save_table_html=True,
        save_layout_image=False,
    )

    formatter = OutputFormatter("./test_output")
    output_files = formatter.save_results(sample_results, output_options)

    print("Generated files:", output_files)
|