Преглед изворног кода

refactor: update content type references in pipeline and VLM processing scripts

myhloli пре 4 месеци
родитељ
комит
d9b5d004d9

+ 11 - 14
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -193,12 +193,12 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
     para_content = {}
     if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
         para_content = {
-            'type': 'text',
+            'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.TITLE:
         para_content = {
-            'type': 'text',
+            'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
         title_level = get_title_level(para_block)
@@ -208,14 +208,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
         if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
             return None
         para_content = {
-            'type': 'equation',
+            'type': ContentType.EQUATION,
             'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
         }
         if para_block['lines'][0]['spans'][0].get('content', ''):
             para_content['text'] = merge_para_with_text(para_block)
             para_content['text_format'] = 'latex'
     elif para_type == BlockType.IMAGE:
-        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
+        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.IMAGE_BODY:
                 for line in block['lines']:
@@ -224,29 +224,26 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
                             if span.get('image_path', ''):
                                 para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
             if block['type'] == BlockType.IMAGE_CAPTION:
-                para_content['img_caption'].append(merge_para_with_text(block))
+                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.IMAGE_FOOTNOTE:
-                para_content['img_footnote'].append(merge_para_with_text(block))
+                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
     elif para_type == BlockType.TABLE:
-        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
+        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.TABLE_BODY:
                 for line in block['lines']:
                     for span in line['spans']:
                         if span['type'] == ContentType.TABLE:
-
-                            if span.get('latex', ''):
-                                para_content['table_body'] = f"{span['latex']}"
-                            elif span.get('html', ''):
-                                para_content['table_body'] = f"{span['html']}"
+                            if span.get('html', ''):
+                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"
 
                             if span.get('image_path', ''):
                                 para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
 
             if block['type'] == BlockType.TABLE_CAPTION:
-                para_content['table_caption'].append(merge_para_with_text(block))
+                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.TABLE_FOOTNOTE:
-                para_content['table_footnote'].append(merge_para_with_text(block))
+                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
 
     para_content['page_idx'] = page_idx
 

+ 10 - 10
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -130,25 +130,25 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
     para_content = {}
     if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
         para_content = {
-            'type': 'text',
+            'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.TITLE:
         title_level = get_title_level(para_block)
         para_content = {
-            'type': 'text',
+            'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
         if title_level != 0:
             para_content['text_level'] = title_level
     elif para_type == BlockType.INTERLINE_EQUATION:
         para_content = {
-            'type': 'equation',
+            'type': ContentType.EQUATION,
             'text': merge_para_with_text(para_block),
             'text_format': 'latex',
         }
     elif para_type == BlockType.IMAGE:
-        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
+        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.IMAGE_BODY:
                 for line in block['lines']:
@@ -157,11 +157,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
                             if span.get('image_path', ''):
                                 para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
             if block['type'] == BlockType.IMAGE_CAPTION:
-                para_content['img_caption'].append(merge_para_with_text(block))
+                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.IMAGE_FOOTNOTE:
-                para_content['img_footnote'].append(merge_para_with_text(block))
+                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
     elif para_type == BlockType.TABLE:
-        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
+        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.TABLE_BODY:
                 for line in block['lines']:
@@ -169,15 +169,15 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
                         if span['type'] == ContentType.TABLE:
 
                             if span.get('html', ''):
-                                para_content['table_body'] = f"{span['html']}"
+                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"
 
                             if span.get('image_path', ''):
                                 para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
 
             if block['type'] == BlockType.TABLE_CAPTION:
-                para_content['table_caption'].append(merge_para_with_text(block))
+                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.TABLE_FOOTNOTE:
-                para_content['table_footnote'].append(merge_para_with_text(block))
+                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
 
     para_content['page_idx'] = page_idx
 

+ 1 - 0
mineru/utils/enum_class.py

@@ -21,6 +21,7 @@ class ContentType:
     TEXT = 'text'
     INTERLINE_EQUATION = 'interline_equation'
     INLINE_EQUATION = 'inline_equation'
+    EQUATION = 'equation'
 
 
 class CategoryId: