|
|
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
|
if mode == 'nlp':
|
|
|
continue
|
|
|
elif mode == 'mm':
|
|
|
- for block in para_block['blocks']: # 1st.拼image_body
|
|
|
- if block['type'] == BlockType.ImageBody:
|
|
|
- for line in block['lines']:
|
|
|
- for span in line['spans']:
|
|
|
- if span['type'] == ContentType.Image:
|
|
|
- if span.get('image_path', ''):
|
|
|
- para_text += f"\n}) \n"
|
|
|
- for block in para_block['blocks']: # 2nd.拼image_caption
|
|
|
- if block['type'] == BlockType.ImageCaption:
|
|
|
- para_text += merge_para_with_text(block) + ' \n'
|
|
|
- for block in para_block['blocks']: # 3rd.拼image_footnote
|
|
|
- if block['type'] == BlockType.ImageFootnote:
|
|
|
- para_text += merge_para_with_text(block) + ' \n'
|
|
|
+ # Check whether this image block contains an image footnote
|
|
|
+ has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
|
|
|
+ # If an image footnote exists, emit the caption first, then the image body, and append the footnote after the image body
|
|
|
+ if has_image_footnote:
|
|
|
+ for block in para_block['blocks']: # 1st.拼image_caption
|
|
|
+ if block['type'] == BlockType.ImageCaption:
|
|
|
+ para_text += merge_para_with_text(block) + ' \n'
|
|
|
+ for block in para_block['blocks']: # 2nd.拼image_body
|
|
|
+ if block['type'] == BlockType.ImageBody:
|
|
|
+ for line in block['lines']:
|
|
|
+ for span in line['spans']:
|
|
|
+ if span['type'] == ContentType.Image:
|
|
|
+ if span.get('image_path', ''):
|
|
|
+ para_text += f""
|
|
|
+ for block in para_block['blocks']: # 3rd.拼image_footnote
|
|
|
+ if block['type'] == BlockType.ImageFootnote:
|
|
|
+ para_text += ' \n' + merge_para_with_text(block)
|
|
|
+ else:
|
|
|
+ for block in para_block['blocks']: # 1st.拼image_body
|
|
|
+ if block['type'] == BlockType.ImageBody:
|
|
|
+ for line in block['lines']:
|
|
|
+ for span in line['spans']:
|
|
|
+ if span['type'] == ContentType.Image:
|
|
|
+ if span.get('image_path', ''):
|
|
|
+ para_text += f""
|
|
|
+ for block in para_block['blocks']: # 2nd.拼image_caption
|
|
|
+ if block['type'] == BlockType.ImageCaption:
|
|
|
+ para_text += ' \n' + merge_para_with_text(block)
|
|
|
elif para_type == BlockType.Table:
|
|
|
if mode == 'nlp':
|
|
|
continue
|
|
|
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
|
for span in line['spans']:
|
|
|
if span['type'] == ContentType.Table:
|
|
|
# if processed by table model
|
|
|
- if span.get('latex', ''):
|
|
|
- para_text += f"\n\n$\n {span['latex']}\n$\n\n"
|
|
|
- elif span.get('html', ''):
|
|
|
- para_text += f"\n\n{span['html']}\n\n"
|
|
|
+ if span.get('html', ''):
|
|
|
+ para_text += f"\n{span['html']}\n"
|
|
|
elif span.get('image_path', ''):
|
|
|
- para_text += f"\n}) \n"
|
|
|
+ para_text += f""
|
|
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
|
|
if block['type'] == BlockType.TableFootnote:
|
|
|
- para_text += merge_para_with_text(block) + ' \n'
|
|
|
+ para_text += '\n' + merge_para_with_text(block) + ' '
|
|
|
|
|
|
if para_text.strip() == '':
|
|
|
continue
|
|
|
else:
|
|
|
- page_markdown.append(para_text.strip() + ' ')
|
|
|
+ # page_markdown.append(para_text.strip() + ' ')
|
|
|
+ page_markdown.append(para_text.strip())
|
|
|
|
|
|
return page_markdown
|
|
|
|
|
|
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
|
|
|
if span['type'] == ContentType.Table:
|
|
|
|
|
|
if span.get('latex', ''):
|
|
|
- para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
|
|
|
+ para_content['table_body'] = f"{span['latex']}"
|
|
|
elif span.get('html', ''):
|
|
|
- para_content['table_body'] = f"\n\n{span['html']}\n\n"
|
|
|
+ para_content['table_body'] = f"{span['html']}"
|
|
|
|
|
|
if span.get('image_path', ''):
|
|
|
para_content['img_path'] = join_path(img_buket_path, span['image_path'])
|