|
|
@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
|
|
|
def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
|
|
|
para_type = para_block['type']
|
|
|
para_content = {}
|
|
|
- if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
|
|
|
+ if para_type in [
|
|
|
+ BlockType.TEXT,
|
|
|
+ BlockType.LIST,
|
|
|
+ BlockType.INDEX,
|
|
|
+ ]:
|
|
|
para_content = {
|
|
|
'type': ContentType.TEXT,
|
|
|
'text': merge_para_with_text(para_block),
|
|
|
}
|
|
|
+ elif para_type == BlockType.DISCARDED:
|
|
|
+ para_content = {
|
|
|
+ 'type': para_type,
|
|
|
+ 'text': merge_para_with_text(para_block),
|
|
|
+ }
|
|
|
elif para_type == BlockType.TITLE:
|
|
|
para_content = {
|
|
|
'type': ContentType.TEXT,
|
|
|
@@ -268,15 +277,18 @@ def union_make(pdf_info_dict: list,
|
|
|
output_content = []
|
|
|
for page_info in pdf_info_dict:
|
|
|
paras_of_layout = page_info.get('para_blocks')
|
|
|
+ paras_of_discarded = page_info.get('discarded_blocks')
|
|
|
page_idx = page_info.get('page_idx')
|
|
|
page_size = page_info.get('page_size')
|
|
|
- if not paras_of_layout:
|
|
|
- continue
|
|
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
|
|
+ if not paras_of_layout:
|
|
|
+ continue
|
|
|
page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
|
|
|
output_content.extend(page_markdown)
|
|
|
elif make_mode == MakeMode.CONTENT_LIST:
|
|
|
- for para_block in paras_of_layout:
|
|
|
+ if not paras_of_layout + paras_of_discarded:
|
|
|
+ continue
|
|
|
+ for para_block in paras_of_layout + paras_of_discarded:
|
|
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
|
if para_content:
|
|
|
output_content.append(para_content)
|