|
|
@@ -89,13 +89,17 @@ class MarkdownGenerator:
|
|
|
md_lines.extend(MarkdownGenerator._format_mineru_image(
|
|
|
item, output_path, source_file
|
|
|
))
|
|
|
- elif item_type == 'equation':
|
|
|
+ elif item_type in ['equation', 'interline_equation']:
|
|
|
md_lines.extend(MarkdownGenerator._format_equation(item))
|
|
|
elif item_type == 'inline_equation':
|
|
|
md_lines.extend(MarkdownGenerator._format_inline_equation(item))
|
|
|
- elif item_type in ['page_number', 'header', 'footer']:
|
|
|
- md_lines.extend(MarkdownGenerator._format_metadata(item, item_type))
|
|
|
- elif item_type == 'reference':
|
|
|
+ elif item_type == 'header':
|
|
|
+ md_lines.extend(MarkdownGenerator._format_mineru_header(item))
|
|
|
+ elif item_type == 'footer':
|
|
|
+ md_lines.extend(MarkdownGenerator._format_mineru_footer(item))
|
|
|
+ elif item_type == 'page_number':
|
|
|
+ md_lines.extend(MarkdownGenerator._format_mineru_page_number(item))
|
|
|
+ elif item_type == 'ref_text':
|
|
|
md_lines.extend(MarkdownGenerator._format_reference(item))
|
|
|
else:
|
|
|
md_lines.extend(MarkdownGenerator._format_unknown(item))
|
|
|
@@ -255,6 +259,42 @@ class MarkdownGenerator:
|
|
|
|
|
|
return lines
|
|
|
|
|
|
@staticmethod
def _format_mineru_header(item: Dict) -> List[str]:
    """Format a MinerU ``header`` item as an HTML comment line.

    Args:
        item: Item dict. The optional ``bbox`` and ``text`` keys are read.

    Returns:
        A list of Markdown lines: the result of ``_add_bbox_comment`` for
        the bbox (only when ``bbox`` is non-empty), followed by a single
        ``<!-- 页眉: ... -->`` comment line carrying the header text.
    """
    lines = []
    bbox = item.get('bbox', [])
    if bbox:
        lines.append(MarkdownGenerator._add_bbox_comment(bbox))

    # ``.get('text', '')`` would still return None for an explicit
    # ``"text": None`` entry and render the literal string "None".
    text = item.get('text')
    if text is None:
        text = ''
    # A literal "-->" inside the text would close the HTML comment early and
    # leak the remainder into the visible document, so neutralise it.
    text = str(text).replace('-->', '--&gt;')
    lines.append(f"<!-- 页眉: {text} -->\n")
    return lines
|
|
|
+
|
|
|
@staticmethod
def _format_mineru_footer(item: Dict) -> List[str]:
    """Format a MinerU ``footer`` item as an HTML comment line.

    Args:
        item: Item dict. The optional ``bbox`` and ``text`` keys are read.

    Returns:
        A list of Markdown lines: the result of ``_add_bbox_comment`` for
        the bbox (only when ``bbox`` is non-empty), followed by a single
        ``<!-- 页脚: ... -->`` comment line carrying the footer text.
    """
    lines = []
    bbox = item.get('bbox', [])
    if bbox:
        lines.append(MarkdownGenerator._add_bbox_comment(bbox))

    # ``.get('text', '')`` would still return None for an explicit
    # ``"text": None`` entry and render the literal string "None".
    text = item.get('text')
    if text is None:
        text = ''
    # A literal "-->" inside the text would close the HTML comment early and
    # leak the remainder into the visible document, so neutralise it.
    text = str(text).replace('-->', '--&gt;')
    lines.append(f"<!-- 页脚: {text} -->\n")
    return lines
|
|
|
+
|
|
|
@staticmethod
def _format_mineru_page_number(item: Dict) -> List[str]:
    """Format a MinerU ``page_number`` item as an HTML comment line.

    Args:
        item: Item dict. The optional ``bbox`` and ``text`` keys are read.

    Returns:
        A list of Markdown lines: the result of ``_add_bbox_comment`` for
        the bbox (only when ``bbox`` is non-empty), followed by a single
        ``<!-- 页码: ... -->`` comment line carrying the page-number text.
    """
    lines = []
    bbox = item.get('bbox', [])
    if bbox:
        lines.append(MarkdownGenerator._add_bbox_comment(bbox))

    # ``.get('text', '')`` would still return None for an explicit
    # ``"text": None`` entry and render the literal string "None".
    text = item.get('text')
    if text is None:
        text = ''
    # A literal "-->" inside the text would close the HTML comment early and
    # leak the remainder into the visible document, so neutralise it.
    text = str(text).replace('-->', '--&gt;')
    lines.append(f"<!-- 页码: {text} -->\n")
    return lines
|
|
|
+
|
|
|
# ================== PaddleOCR_VL 格式化方法 ==================
|
|
|
|
|
|
@staticmethod
|