Ver código fonte

fix: handle empty lines and spans in pipeline_middle_json_mkcontent.py

myhloli 5 meses atrás
pai
commit
cfc7840689

+ 6 - 1
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -34,6 +34,8 @@ def make_blocks_to_markdown(paras_of_layout,
             title_level = get_title_level(para_block)
             para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
         elif para_type == BlockType.INTERLINE_EQUATION:
+            if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
+                continue
             if para_block['lines'][0]['spans'][0].get('content', ''):
                 para_text = merge_para_with_text(para_block)
             else:
@@ -201,6 +203,8 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
         if title_level != 0:
             para_content['text_level'] = title_level
     elif para_type == BlockType.INTERLINE_EQUATION:
+        if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
+            return None
         para_content = {
             'type': 'equation',
             'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
@@ -263,7 +267,8 @@ def union_make(pdf_info_dict: list,
         elif make_mode == MakeMode.CONTENT_LIST:
             for para_block in paras_of_layout:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
-                output_content.append(para_content)
+                if para_content:
+                    output_content.append(para_content)
 
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
         return '\n\n'.join(output_content)