|
|
@@ -1,770 +0,0 @@
|
|
|
-"""
|
|
|
-输出格式化器 - 将处理结果转换为多种格式输出
|
|
|
-严格复用MinerU的输出格式,确保完全兼容
|
|
|
-"""
|
|
|
-import json
|
|
|
-import os
|
|
|
-import sys
|
|
|
-from pathlib import Path
|
|
|
-from typing import Dict, Any, List, Union
|
|
|
-from loguru import logger
|
|
|
-import numpy as np
|
|
|
-from PIL import Image, ImageDraw, ImageFont
|
|
|
-
|
|
|
-# 导入MinerU的中间格式转换模块
|
|
|
-mineru_path = Path(__file__).parents[3]
|
|
|
-if str(mineru_path) not in sys.path:
|
|
|
- sys.path.insert(0, str(mineru_path))
|
|
|
-
|
|
|
-from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
|
-from mineru.utils.enum_class import MakeMode, BlockType, ContentType
|
|
|
-
|
|
|
-
|
|
|
class OutputFormatter:
    """Save document-processing results in MinerU-compatible output formats.

    From a single ``results`` dictionary the formatter can write a
    ``*_middle.json`` file, a ``*_content_list.json`` file, a Markdown file,
    one standalone HTML file per table and, optionally, one annotated layout
    image per page. All files go into ``<output_dir>/<document stem>/``.
    """

    # Element types that are stored under ``discarded_blocks`` rather than
    # ``para_blocks`` when building the middle JSON.
    _DISCARDED_TYPES = (
        'header', 'footer', 'page_number', 'aside_text', 'page_footnote',
    )

    # Font files tried, in order, for the labels drawn on layout images.
    _LABEL_FONT_PATHS = (
        "/System/Library/Fonts/Helvetica.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    _LABEL_FONT_SIZE = 14

    def __init__(self, output_dir: str):
        """Create the formatter and make sure ``output_dir`` exists.

        Args:
            output_dir: Root directory under which per-document output
                directories are created.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # RGB colour per block type, used when drawing layout images.
        # (The original comment states these match MinerU's colours.)
        self.color_map = {
            BlockType.TITLE: (102, 102, 255),            # blue
            BlockType.TEXT: (153, 0, 76),                # dark red
            BlockType.IMAGE: (153, 255, 51),             # green
            BlockType.IMAGE_BODY: (153, 255, 51),
            BlockType.IMAGE_CAPTION: (102, 178, 255),
            BlockType.IMAGE_FOOTNOTE: (255, 178, 102),
            BlockType.TABLE: (204, 204, 0),              # yellow
            BlockType.TABLE_BODY: (204, 204, 0),
            BlockType.TABLE_CAPTION: (255, 255, 102),
            BlockType.TABLE_FOOTNOTE: (229, 255, 204),
            BlockType.INTERLINE_EQUATION: (0, 255, 0),   # bright green
            BlockType.LIST: (40, 169, 92),
            BlockType.CODE: (102, 0, 204),               # purple
            BlockType.CODE_BODY: (102, 0, 204),
            BlockType.CODE_CAPTION: (204, 153, 255),
        }

    def save_results(
        self,
        results: Dict[str, Any],
        output_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Write the processing results in every enabled output format.

        Args:
            results: Processing results. Must contain ``document_path`` and
                ``pages``; each page may carry a ``processed_image`` that is
                used for the layout image.
            output_config: Output switches. ``save_json``,
                ``save_content_list``, ``save_markdown`` and
                ``save_table_html`` default to True; ``save_layout_image``
                defaults to False. ``draw_type_label`` and
                ``draw_bbox_number`` (default True) control layout labels.

        Returns:
            Mapping from output kind to the written path as a string. The
            ``layout_images`` entry, when present, is a list of paths.
        """
        output_paths = {}

        # Every document gets its own sub-directory named after the file stem.
        doc_name = Path(results['document_path']).stem
        doc_output_dir = self.output_dir / doc_name
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # 1. Convert to the middle.json structure shared by all outputs.
        middle_json = self._convert_to_middle_json(results)

        # 2. middle.json
        if output_config.get('save_json', True):
            middle_json_path = doc_output_dir / f"{doc_name}_middle.json"
            with open(middle_json_path, 'w', encoding='utf-8') as f:
                json.dump(middle_json, f, ensure_ascii=False, indent=2)
            output_paths['middle_json'] = str(middle_json_path)
            logger.info(f"📄 Middle JSON saved: {middle_json_path}")

        # 3. content_list.json (generated by vlm_union_make)
        if output_config.get('save_content_list', True):
            content_list_path = self._save_content_list(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['content_list'] = str(content_list_path)

        # 4. Markdown
        if output_config.get('save_markdown', True):
            md_path = self._save_markdown(middle_json, doc_output_dir, doc_name)
            output_paths['markdown'] = str(md_path)

        # 5. One HTML file per table
        if output_config.get('save_table_html', True):
            table_html_dir = self._save_table_htmls(
                middle_json, doc_output_dir, doc_name
            )
            output_paths['table_htmls'] = str(table_html_dir)

        # 6. Annotated layout images
        if output_config.get('save_layout_image', False):
            layout_image_paths = self._save_layout_image(
                middle_json=middle_json,
                results=results,
                output_dir=doc_output_dir,
                doc_name=doc_name,
                draw_type_label=output_config.get('draw_type_label', True),
                draw_bbox_number=output_config.get('draw_bbox_number', True)
            )
            output_paths['layout_images'] = layout_image_paths

        logger.info(f"✅ Results saved to: {doc_output_dir}")
        return output_paths

    def _convert_to_middle_json(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Build the middle.json structure from the processing results.

        Each page becomes one ``pdf_info`` entry whose elements are converted
        to blocks and sorted into ``para_blocks`` or ``discarded_blocks``.

        Args:
            results: Processing results with a ``pages`` list; every page
                needs ``page_idx`` and ``elements``.

        Returns:
            The middle JSON dictionary.
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for page in results['pages']:
            page_info = {
                'page_idx': page['page_idx'],
                # image_shape starts with (height, width); page_size is
                # stored as [width, height].
                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            for element in page['elements']:
                block = self._element_to_mineru_block(element, page_info['page_size'])
                if not block:
                    continue
                if element.get('type') in self._DISCARDED_TYPES:
                    page_info['discarded_blocks'].append(block)
                else:
                    page_info['para_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _as_text_list(value: Any) -> List[Any]:
        """Normalise a caption/footnote/list field to a list of entries.

        ``None`` becomes an empty list and a single string becomes a
        one-element list, so a lone caption is never iterated by character.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _make_line(bbox: Any, span_type: Any, **span_fields: Any) -> Dict[str, Any]:
        """Return one line holding a single span of ``span_type``."""
        span = {'bbox': bbox, 'type': span_type}
        span.update(span_fields)
        return {'bbox': bbox, 'spans': [span]}

    @classmethod
    def _make_sub_block(
        cls,
        block_type: Any,
        bbox: Any,
        span_type: Any,
        **span_fields: Any
    ) -> Dict[str, Any]:
        """Return a child block (angle 0) holding a single one-span line."""
        return {
            'type': block_type,
            'bbox': bbox,
            'angle': 0,
            'lines': [cls._make_line(bbox, span_type, **span_fields)]
        }

    def _element_to_mineru_block(
        self,
        element: Dict[str, Any],
        page_size: List[int]
    ) -> Dict[str, Any]:
        """Convert one result element into a middle.json block.

        Child blocks, lines and spans all reuse the element's bbox because
        the element carries no finer-grained geometry.

        Args:
            element: Result element with ``type``, ``bbox`` and ``content``.
            page_size: ``[width, height]`` of the page. Currently unused;
                bboxes are stored as given, without normalisation.

        Returns:
            The block dictionary. Elements of an unrecognised type yield a
            block with empty ``lines`` and no children.
        """
        element_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])

        block = {
            'type': element_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),
            'lines': []
        }

        raw_content = element.get('content', {})
        # Structured branches read keys from a dict; anything else is
        # treated as "no content" instead of raising AttributeError.
        content = raw_content if isinstance(raw_content, dict) else {}

        text_types = (
            BlockType.TEXT, BlockType.TITLE, BlockType.REF_TEXT,
            BlockType.PHONETIC, BlockType.HEADER, BlockType.FOOTER,
            BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE,
        )

        if element_type in text_types:
            if isinstance(raw_content, dict):
                text = raw_content.get('text', '')
            elif raw_content is None:
                # Avoid emitting the literal string "None" as text.
                text = ''
            else:
                text = str(raw_content)

            if text:
                block['lines'] = [
                    self._make_line(bbox, ContentType.TEXT, content=text)
                ]

            if element_type == BlockType.TITLE and 'level' in element:
                block['level'] = element['level']

        elif element_type == BlockType.LIST:
            block['sub_type'] = element.get('sub_type', 'text')
            block['blocks'] = [
                self._make_sub_block(
                    BlockType.TEXT, bbox, ContentType.TEXT, content=item_text
                )
                for item_text in self._as_text_list(content.get('list_items'))
            ]

        elif element_type == BlockType.CODE:
            block['sub_type'] = element.get('sub_type', 'code')
            block['blocks'] = []

            code_body = content.get('code_body', '')
            if code_body:
                block['blocks'].append(self._make_sub_block(
                    BlockType.CODE_BODY, bbox, ContentType.TEXT, content=code_body
                ))

            if 'guess_lang' in element:
                block['guess_lang'] = element['guess_lang']

            for caption_text in self._as_text_list(content.get('code_caption')):
                block['blocks'].append(self._make_sub_block(
                    BlockType.CODE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

        elif element_type == BlockType.INTERLINE_EQUATION:
            block['lines'] = [self._make_line(
                bbox, ContentType.INTERLINE_EQUATION,
                content=content.get('latex', '')
            )]

        elif element_type == BlockType.IMAGE:
            block['blocks'] = []

            img_path = content.get('img_path', '')
            if img_path:
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_BODY, bbox, ContentType.IMAGE,
                    image_path=img_path
                ))

            for caption_text in self._as_text_list(content.get('image_caption')):
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

            for footnote_text in self._as_text_list(content.get('image_footnote')):
                block['blocks'].append(self._make_sub_block(
                    BlockType.IMAGE_FOOTNOTE, bbox, ContentType.TEXT,
                    content=footnote_text
                ))

        elif element_type == BlockType.TABLE:
            block['blocks'] = []

            table_html = content.get('html', '')
            img_path = content.get('img_path', '')
            if table_html or img_path:
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_BODY, bbox, ContentType.TABLE,
                    html=table_html, image_path=img_path
                ))

            for caption_text in self._as_text_list(content.get('table_caption')):
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_CAPTION, bbox, ContentType.TEXT,
                    content=caption_text
                ))

            for footnote_text in self._as_text_list(content.get('table_footnote')):
                block['blocks'].append(self._make_sub_block(
                    BlockType.TABLE_FOOTNOTE, bbox, ContentType.TEXT,
                    content=footnote_text
                ))

        return block

    def _normalize_bbox(self, bbox: List[float], page_size: List[int]) -> List[float]:
        """Scale an absolute bbox into the 0-1 range.

        Args:
            bbox: ``[x0, y0, x1, y1]`` in absolute coordinates.
            page_size: ``[width, height]`` of the page.

        Returns:
            The normalised bbox. A missing or malformed bbox yields all
            zeros, and a non-positive page dimension yields 0.0 on that axis.
        """
        if not bbox or len(bbox) != 4:
            return [0.0, 0.0, 0.0, 0.0]

        page_width, page_height = page_size
        x0, y0, x1, y1 = bbox

        return [
            x0 / page_width if page_width > 0 else 0.0,
            y0 / page_height if page_height > 0 else 0.0,
            x1 / page_width if page_width > 0 else 0.0,
            y1 / page_height if page_height > 0 else 0.0
        ]

    def _save_content_list(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Generate ``<doc_name>_content_list.json`` with ``vlm_union_make``.

        If generation or writing fails, the error is logged and an empty
        JSON list is written so the output file always exists.

        Returns:
            Path of the content-list file.
        """
        content_list_path = output_dir / f"{doc_name}_content_list.json"

        try:
            content_list = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.CONTENT_LIST,
                img_buket_path='images'
            )

            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump(content_list, f, ensure_ascii=False, indent=2)

            logger.info(f"📋 Content list saved: {content_list_path}")

        except Exception as e:
            # Deliberate best-effort: one failing format must not abort the
            # remaining outputs.
            logger.error(f"❌ Failed to generate content_list: {e}")
            with open(content_list_path, 'w', encoding='utf-8') as f:
                json.dump([], f)

        return content_list_path

    def _save_markdown(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Generate ``<doc_name>.md`` with ``vlm_union_make``.

        The file starts with a front-matter header (scene, backend, version)
        followed by the generated Markdown. On failure the error is logged
        and a short placeholder document is written instead.

        Returns:
            Path of the Markdown file.
        """
        md_path = output_dir / f"{doc_name}.md"

        try:
            # The Markdown refers to images under the 'images' bucket path.
            images_dir = output_dir / 'images'
            images_dir.mkdir(exist_ok=True)

            markdown_content = vlm_union_make(
                middle_json['pdf_info'],
                make_mode=MakeMode.MM_MD,
                img_buket_path='images'
            )

            metadata = (
                "---\n"
                f"scene: {middle_json.get('_scene', 'unknown')}\n"
                f"backend: {middle_json.get('_backend', 'vlm')}\n"
                f"version: {middle_json.get('_version_name', '2.5.0')}\n"
                "---\n"
                "\n"
            )

            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(metadata)
                f.write(markdown_content)

            logger.info(f"📝 Markdown saved: {md_path}")

        except Exception as e:
            # Deliberate best-effort, see _save_content_list.
            logger.error(f"❌ Failed to generate markdown: {e}")
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(f"# {doc_name}\n\n*Markdown generation failed*\n")

        return md_path

    @staticmethod
    def _iter_table_htmls(middle_json: Dict[str, Any]):
        """Yield ``(page_position, html)`` for every non-empty table body span."""
        for page_pos, page_info in enumerate(middle_json['pdf_info']):
            for block in page_info.get('para_blocks', []):
                if block.get('type') != BlockType.TABLE:
                    continue
                for sub_block in block.get('blocks', []):
                    if sub_block.get('type') != BlockType.TABLE_BODY:
                        continue
                    for line in sub_block.get('lines', []):
                        for span in line.get('spans', []):
                            html_content = span.get('html', '')
                            if html_content:
                                yield page_pos, html_content

    def _save_table_htmls(
        self,
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """Save every table as a standalone HTML file under ``tables/``.

        Files are named ``<doc_name>_table_<n>_page_<page position>.html``,
        where ``n`` counts tables across the whole document starting at 1.

        Returns:
            The ``tables`` directory (created even when there are no tables).
        """
        tables_dir = output_dir / 'tables'
        tables_dir.mkdir(exist_ok=True)

        table_count = 0
        for page_idx, html_content in self._iter_table_htmls(middle_json):
            table_count += 1
            table_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx}.html"

            full_html = self._wrap_table_html(
                html_content,
                f"{doc_name} - Table {table_count}",
                page_idx
            )

            with open(table_path, 'w', encoding='utf-8') as f:
                f.write(full_html)

            logger.info(f"📊 Table {table_count} saved: {table_path}")

        if table_count > 0:
            logger.info(f"📊 Total {table_count} tables saved to: {tables_dir}")

        return tables_dir

    def _wrap_table_html(self, table_html: str, title: str, page_idx: int) -> str:
        """Wrap a table fragment in a complete, styled HTML document.

        Args:
            table_html: Table markup, inserted verbatim.
            title: Plain-text title. It is HTML-escaped because it is built
                from the document file name and may contain ``<`` or ``&``.
            page_idx: Zero-based page position; shown one-based.

        Returns:
            The full HTML document as a string.
        """
        from html import escape as escape_html

        safe_title = escape_html(title)
        return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        body {{
            font-family: Arial, "Microsoft YaHei", sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }}
        .meta {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #ddd;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        tr:hover {{
            background-color: #f9f9f9;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="meta">
            <p><strong>Title:</strong> {safe_title}</p>
            <p><strong>Page:</strong> {page_idx + 1}</p>
        </div>
        {table_html}
    </div>
</body>
</html>"""

    def _load_label_font(self):
        """Return the first loadable label font, else Pillow's default font."""
        for font_path in self._LABEL_FONT_PATHS:
            try:
                return ImageFont.truetype(font_path, self._LABEL_FONT_SIZE)
            except OSError:
                # Font file missing or unreadable on this system; try the next.
                continue
        return ImageFont.load_default()

    @staticmethod
    def _open_page_image(source: Any):
        """Return ``source`` as an RGB PIL image, or None if unsupported.

        Accepts a file path string, a numpy array or a PIL image.
        """
        if isinstance(source, str):
            return Image.open(source).convert('RGB')
        if isinstance(source, np.ndarray):
            return Image.fromarray(source).convert('RGB')
        if isinstance(source, Image.Image):
            return source.convert('RGB')
        return None

    def _save_layout_image(
        self,
        middle_json: Dict[str, Any],
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True
    ) -> List[Path]:
        """Draw the detected layout on each page image and save it as PNG.

        Pages without a usable ``processed_image`` are skipped with a log
        message; the remaining pages are still rendered.

        Args:
            middle_json: Middle JSON whose ``pdf_info`` supplies the blocks.
            results: Processing results; each page's ``processed_image`` is
                the image to annotate (path, numpy array or PIL image).
            output_dir: Directory receiving ``<doc_name>_<n>_layout.png``.
            doc_name: Document name used as file-name prefix.
            draw_type_label: Whether to label each box with its block type.
            draw_bbox_number: Whether to label each box with its index.

        Returns:
            Paths of the layout images written (empty if none were written).
        """
        layout_image_paths = []

        pages = results.get('pages', [])
        pdf_info = middle_json.get('pdf_info', [])

        if len(pages) == 0:
            logger.warning("⚠️ No pages found in results")
            return layout_image_paths

        logger.info(f"🖼️ Generating layout images for {len(pages)} page(s)...")

        # The font does not depend on the page, so load it only once.
        font = self._load_label_font()

        for page_idx, (page, page_info) in enumerate(zip(pages, pdf_info)):
            original_image = page.get('processed_image')
            if original_image is None:
                logger.warning(f"⚠️ No processed_image found for page {page_idx}, skipping layout image.")
                continue

            image = self._open_page_image(original_image)
            if image is None:
                logger.error("Invalid image type")
                continue

            layout_image_path = output_dir / f"{doc_name}_{page_idx + 1}_layout.png"

            blocks = page_info.get('para_blocks', []) + page_info.get('discarded_blocks', [])
            for block_idx, block in enumerate(blocks, start=1):
                block_type = block.get('type', '')
                raw_bbox = block.get('bbox') or [0, 0, 0, 0]
                try:
                    x0, y0, x1, y1 = (int(value) for value in raw_bbox[:4])
                except (TypeError, ValueError):
                    logger.warning(f"⚠️ Skipping block {block_idx} on page {page_idx}: invalid bbox {raw_bbox!r}")
                    continue

                # Recent Pillow versions reject rectangles whose corners are
                # not ordered, so make sure (x0, y0) is the top-left corner.
                x0, x1 = min(x0, x1), max(x0, x1)
                y0, y1 = min(y0, y1), max(y0, y1)

                color = self.color_map.get(block_type, (255, 0, 0))

                # Semi-transparent fill: draw on a transparent overlay and
                # alpha-composite it onto the page.
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                ImageDraw.Draw(overlay).rectangle(
                    [x0, y0, x1, y1],
                    fill=(*color, 76),  # about 30% opacity
                    outline=color,
                    width=2
                )
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)

                # Opaque border on top of the fill.
                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)

                if draw_type_label:
                    label = str(block_type).replace('_', ' ').title()
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=color)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)

                if draw_bbox_number:
                    number_text = str(block_idx)
                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
                    draw.rectangle(bbox_number, fill=(255, 0, 0))
                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)

            image.save(layout_image_path)
            logger.info(f"🖼️ Layout image saved: {layout_image_path}")

            layout_image_paths.append(layout_image_path)

        return layout_image_paths
|
|
|
-
|
|
|
-
|
|
|
if __name__ == "__main__":
    # Manual smoke test: format a one-page sample document (a title plus a
    # table) into ./test_output and print the paths that were written.
    title_element = {
        "type": "title",
        "bbox": [100, 50, 800, 100],
        "content": {"text": "财务报告"},
        "confidence": 0.98,
        "level": 1,
    }
    table_element = {
        "type": "table",
        "bbox": [100, 200, 800, 600],
        "content": {
            "html": "<table><tr><td>项目</td><td>金额</td></tr></table>",
            "markdown": "| 项目 | 金额 |\n|------|------|",
            "table_caption": ["表1: 财务数据"],
            "table_footnote": [],
        },
        "confidence": 0.95,
    }
    demo_page = {
        "page_idx": 0,
        "image_shape": [1654, 2338, 3],
        "elements": [title_element, table_element],
    }
    demo_results = {
        "document_path": "/path/to/sample.pdf",
        "scene": "financial_report",
        "pages": [demo_page],
    }
    # The layout image stays disabled: the sample pages carry no image.
    demo_config = {
        "save_json": True,
        "save_content_list": True,
        "save_markdown": True,
        "save_table_html": True,
        "save_layout_image": False,
    }

    output_files = OutputFormatter("./test_output").save_results(demo_results, demo_config)

    print("Generated files:", output_files)
|