6 kuukautta sitten · b122b86e8a
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
 
				             if mode == 'nlp':
			
 
				                 continue
			
 
				             elif mode == 'mm':
			
 
				-                for block in para_block['blocks']:  # 1st.拼image_body
			
 
				-                    if block['type'] == BlockType.ImageBody:
			
 
				-                        for line in block['lines']:
			
 
				-                            for span in line['spans']:
			
 
				-                                if span['type'] == ContentType.Image:
			
 
				-                                    if span.get('image_path', ''):
			
 
				-                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
			
 
				-                for block in para_block['blocks']:  # 2nd.拼image_caption
			
 
				-                    if block['type'] == BlockType.ImageCaption:
			
 
				-                        para_text += merge_para_with_text(block) + '  \n'
			
 
				-                for block in para_block['blocks']:  # 3rd.拼image_footnote
			
 
				-                    if block['type'] == BlockType.ImageFootnote:
			
 
				-                        para_text += merge_para_with_text(block) + '  \n'
			
 
				+                # 检测是否存在图片脚注
			
 
				+                has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
			
 
				+                # 如果存在图片脚注，则将图片脚注拼接到图片正文后面
			
 
				+                if has_image_footnote:
			
 
				+                    for block in para_block['blocks']:  # 1st.拼image_caption
			
 
				+                        if block['type'] == BlockType.ImageCaption:
			
 
				+                            para_text += merge_para_with_text(block) + '  \n'
			
 
				+                    for block in para_block['blocks']:  # 2nd.拼image_body
			
 
				+                        if block['type'] == BlockType.ImageBody:
			
 
				+                            for line in block['lines']:
			
 
				+                                for span in line['spans']:
			
 
				+                                    if span['type'] == ContentType.Image:
			
 
				+                                        if span.get('image_path', ''):
			
 
				+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
			
 
				+                    for block in para_block['blocks']:  # 3rd.拼image_footnote
			
 
				+                        if block['type'] == BlockType.ImageFootnote:
			
 
				+                            para_text += '  \n' + merge_para_with_text(block)
			
 
				+                else:
			
 
				+                    for block in para_block['blocks']:  # 1st.拼image_body
			
 
				+                        if block['type'] == BlockType.ImageBody:
			
 
				+                            for line in block['lines']:
			
 
				+                                for span in line['spans']:
			
 
				+                                    if span['type'] == ContentType.Image:
			
 
				+                                        if span.get('image_path', ''):
			
 
				+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
			
 
				+                    for block in para_block['blocks']:  # 2nd.拼image_caption
			
 
				+                        if block['type'] == BlockType.ImageCaption:
			
 
				+                            para_text += '  \n' + merge_para_with_text(block)
			
 
				         elif para_type == BlockType.Table:
			
 
				             if mode == 'nlp':
			
 
				                 continue
			
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
 
				                             for span in line['spans']:
			
 
				                                 if span['type'] == ContentType.Table:
			
 
				                                     # if processed by table model
			
 
				-                                    if span.get('latex', ''):
			
 
				-                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
			
 
				-                                    elif span.get('html', ''):
			
 
				-                                        para_text += f"\n\n{span['html']}\n\n"
			
 
				+                                    if span.get('html', ''):
			
 
				+                                        para_text += f"\n{span['html']}\n"
			
 
				                                     elif span.get('image_path', ''):
			
 
				-                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
			
 
				+                                        para_text += f"![]({img_buket_path}/{span['image_path']})"
			
 
				                 for block in para_block['blocks']:  # 3rd.拼table_footnote
			
 
				                     if block['type'] == BlockType.TableFootnote:
			
 
				-                        para_text += merge_para_with_text(block) + '  \n'
			
 
				+                        para_text += '\n' + merge_para_with_text(block) + '  '
			
 
				 
			
 
				         if para_text.strip() == '':
			
 
				             continue
			
 
				         else:
			
 
				-            page_markdown.append(para_text.strip() + '  ')
			
 
				+            # page_markdown.append(para_text.strip() + '  ')
			
 
				+            page_markdown.append(para_text.strip())
			
 
				 
			
 
				     return page_markdown
			
 
				 
			
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
 
				                         if span['type'] == ContentType.Table:
			
 
				 
			
 
				                             if span.get('latex', ''):
			
 
				-                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
			
 
				+                                para_content['table_body'] = f"{span['latex']}"
			
 
				                             elif span.get('html', ''):
			
 
				-                                para_content['table_body'] = f"\n\n{span['html']}\n\n"
			
 
				+                                para_content['table_body'] = f"{span['html']}"
			
 
				 
			
 
				                             if span.get('image_path', ''):
			
 
				                                 para_content['img_path'] = join_path(img_buket_path, span['image_path'])