Procházet zdrojové kódy

feat: 添加MinerU格式化方法,支持页眉、页脚和页码的Markdown生成

zhch158_admin před 4 týdny
rodič
revize
84e3ccaf99
Změněn 1 soubor: 44 přidání a 4 odebrání
  1. 44 4
      merger/markdown_generator.py

+ 44 - 4
merger/markdown_generator.py

@@ -89,13 +89,17 @@ class MarkdownGenerator:
                 md_lines.extend(MarkdownGenerator._format_mineru_image(
                     item, output_path, source_file
                 ))
-            elif item_type == 'equation':
+            elif item_type in ['equation', 'interline_equation']:
                 md_lines.extend(MarkdownGenerator._format_equation(item))
             elif item_type == 'inline_equation':
                 md_lines.extend(MarkdownGenerator._format_inline_equation(item))
-            elif item_type in ['page_number', 'header', 'footer']:
-                md_lines.extend(MarkdownGenerator._format_metadata(item, item_type))
-            elif item_type == 'reference':
+            elif item_type == 'header':
+                md_lines.extend(MarkdownGenerator._format_mineru_header(item))
+            elif item_type == 'footer':
+                md_lines.extend(MarkdownGenerator._format_mineru_footer(item))
+            elif item_type == 'page_number':
+                md_lines.extend(MarkdownGenerator._format_mineru_page_number(item))
+            elif item_type == 'ref_text':
                 md_lines.extend(MarkdownGenerator._format_reference(item))
             else:
                 md_lines.extend(MarkdownGenerator._format_unknown(item))
@@ -255,6 +259,42 @@ class MarkdownGenerator:
         
         return lines
     
+    @staticmethod
+    def _format_mineru_header(item: Dict) -> List[str]:
+        """Format a MinerU 'header' item: an optional bbox comment line, then its text wrapped in an HTML comment."""
+        lines = []
+        bbox = item.get('bbox', [])
+        if bbox:  # a missing or empty bbox means no bbox comment line is emitted
+            lines.append(MarkdownGenerator._add_bbox_comment(bbox))
+        
+        text = item.get('text', '')  # a missing 'text' key yields an empty comment body
+        lines.append(f"<!-- 页眉: {text} -->\n")  # label means "page header"; NOTE(review): text is not escaped, a "-->" inside it would end the comment early
+        return lines
+
+    @staticmethod
+    def _format_mineru_footer(item: Dict) -> List[str]:
+        """Format a MinerU 'footer' item: an optional bbox comment line, then its text wrapped in an HTML comment."""
+        lines = []
+        bbox = item.get('bbox', [])
+        if bbox:  # a missing or empty bbox means no bbox comment line is emitted
+            lines.append(MarkdownGenerator._add_bbox_comment(bbox))
+        
+        text = item.get('text', '')  # a missing 'text' key yields an empty comment body
+        lines.append(f"<!-- 页脚: {text} -->\n")  # label means "page footer"; NOTE(review): text is not escaped, a "-->" inside it would end the comment early
+        return lines
+
+    @staticmethod
+    def _format_mineru_page_number(item: Dict) -> List[str]:
+        """Format a MinerU 'page_number' item: an optional bbox comment line, then its text wrapped in an HTML comment."""
+        lines = []
+        bbox = item.get('bbox', [])
+        if bbox:  # a missing or empty bbox means no bbox comment line is emitted
+            lines.append(MarkdownGenerator._add_bbox_comment(bbox))
+        
+        text = item.get('text', '')  # a missing 'text' key yields an empty comment body
+        lines.append(f"<!-- 页码: {text} -->\n")  # label means "page number"; NOTE(review): text is not escaped, a "-->" inside it would end the comment early
+        return lines
+
     # ================== PaddleOCR_VL 格式化方法 ==================
     
     @staticmethod