1 年之前 · c4a52ee6c6
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -272,30 +272,28 @@ def para_to_standard_format(para, img_buket_path):
 
				     return para_content
			
 
				 
			
 
				 
			
 
				-def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None):
			
 
				+def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
			
 
				     para_type = para_block['type']
			
 
				+    para_content = {}
			
 
				     if para_type == BlockType.Text:
			
 
				         para_content = {
			
 
				             'type': 'text',
			
 
				             'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
			
 
				-            'page_idx': page_idx,
			
 
				         }
			
 
				     elif para_type == BlockType.Title:
			
 
				         para_content = {
			
 
				             'type': 'text',
			
 
				             'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
			
 
				             'text_level': 1,
			
 
				-            'page_idx': page_idx,
			
 
				         }
			
 
				     elif para_type == BlockType.InterlineEquation:
			
 
				         para_content = {
			
 
				             'type': 'equation',
			
 
				             'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
			
 
				             'text_format': 'latex',
			
 
				-            'page_idx': page_idx,
			
 
				         }
			
 
				     elif para_type == BlockType.Image:
			
 
				-        para_content = {'type': 'image', 'page_idx': page_idx}
			
 
				+        para_content = {'type': 'image'}
			
 
				         for block in para_block['blocks']:
			
 
				             if block['type'] == BlockType.ImageBody:
			
 
				                 para_content['img_path'] = join_path(
			
@@ -306,7 +304,7 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
 
				             if block['type'] == BlockType.ImageFootnote:
			
 
				                 para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
			
 
				     elif para_type == BlockType.Table:
			
 
				-        para_content = {'type': 'table', 'page_idx': page_idx}
			
 
				+        para_content = {'type': 'table'}
			
 
				         for block in para_block['blocks']:
			
 
				             if block['type'] == BlockType.TableBody:
			
 
				                 if block["lines"][0]["spans"][0].get('latex', ''):
			
@@ -319,6 +317,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
 
				             if block['type'] == BlockType.TableFootnote:
			
 
				                 para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
			
 
				 
			
 
				+    para_content['page_idx'] = page_idx
			
 
				+
			
 
				+    if drop_reason is not None:
			
 
				+        para_content['drop_reason'] = drop_reason
			
 
				+
			
 
				     return para_content
			
 
				 
			
 
				 
			
@@ -406,10 +409,14 @@ def union_make(pdf_info_dict: list,
 
				                lang=None):
			
 
				     output_content = []
			
 
				     for page_info in pdf_info_dict:
			
 
				+        drop_reason_flag = False
			
 
				+        drop_reason = None
			
 
				         if page_info.get('need_drop', False):
			
 
				             drop_reason = page_info.get('drop_reason')
			
 
				             if drop_mode == DropMode.NONE:
			
 
				                 pass
			
 
				+            elif drop_mode == DropMode.NONE_WITH_REASON:
			
 
				+                drop_reason_flag = True
			
 
				             elif drop_mode == DropMode.WHOLE_PDF:
			
 
				                 raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
			
 
				                                  f'drop_reason is {drop_reason}'))
			
@@ -434,8 +441,12 @@ def union_make(pdf_info_dict: list,
 
				             output_content.extend(page_markdown)
			
 
				         elif make_mode == MakeMode.STANDARD_FORMAT:
			
 
				             for para_block in paras_of_layout:
			
 
				-                para_content = para_to_standard_format_v2(
			
 
				-                    para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang)
			
 
				+                if drop_reason_flag:
			
 
				+                    para_content = para_to_standard_format_v2(
			
 
				+                        para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang, drop_reason=drop_reason)
			
 
				+                else:
			
 
				+                    para_content = para_to_standard_format_v2(
			
 
				+                        para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang)
			
 
				                 output_content.append(para_content)
			
 
				     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
			
 
				         return '\n\n'.join(output_content)
			
--- a/magic_pdf/libs/MakeContentConfig.py
+++ b/magic_pdf/libs/MakeContentConfig.py
@@ -8,3 +8,4 @@ class DropMode:
 
				     WHOLE_PDF = "whole_pdf"
			
 
				     SINGLE_PAGE = "single_page"
			
 
				     NONE = "none"
			
 
				+    NONE_WITH_REASON = "none_with_reason"