|
|
@@ -193,12 +193,12 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
|
|
|
para_content = {}
|
|
|
if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
|
|
|
para_content = {
|
|
|
- 'type': 'text',
|
|
|
+ 'type': ContentType.TEXT,
|
|
|
'text': merge_para_with_text(para_block),
|
|
|
}
|
|
|
elif para_type == BlockType.TITLE:
|
|
|
para_content = {
|
|
|
- 'type': 'text',
|
|
|
+ 'type': ContentType.TEXT,
|
|
|
'text': merge_para_with_text(para_block),
|
|
|
}
|
|
|
title_level = get_title_level(para_block)
|
|
|
@@ -208,14 +208,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
|
|
|
if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
|
|
|
return None
|
|
|
para_content = {
|
|
|
- 'type': 'equation',
|
|
|
+ 'type': ContentType.EQUATION,
|
|
|
'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
|
|
|
}
|
|
|
if para_block['lines'][0]['spans'][0].get('content', ''):
|
|
|
para_content['text'] = merge_para_with_text(para_block)
|
|
|
para_content['text_format'] = 'latex'
|
|
|
elif para_type == BlockType.IMAGE:
|
|
|
- para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
|
|
|
+ para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
|
|
|
for block in para_block['blocks']:
|
|
|
if block['type'] == BlockType.IMAGE_BODY:
|
|
|
for line in block['lines']:
|
|
|
@@ -224,29 +224,26 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
|
|
|
if span.get('image_path', ''):
|
|
|
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
|
|
|
if block['type'] == BlockType.IMAGE_CAPTION:
|
|
|
- para_content['img_caption'].append(merge_para_with_text(block))
|
|
|
+ para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
|
|
|
if block['type'] == BlockType.IMAGE_FOOTNOTE:
|
|
|
- para_content['img_footnote'].append(merge_para_with_text(block))
|
|
|
+ para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
|
|
|
elif para_type == BlockType.TABLE:
|
|
|
- para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
|
|
|
+ para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
|
|
|
for block in para_block['blocks']:
|
|
|
if block['type'] == BlockType.TABLE_BODY:
|
|
|
for line in block['lines']:
|
|
|
for span in line['spans']:
|
|
|
if span['type'] == ContentType.TABLE:
|
|
|
-
|
|
|
- if span.get('latex', ''):
|
|
|
- para_content['table_body'] = f"{span['latex']}"
|
|
|
- elif span.get('html', ''):
|
|
|
- para_content['table_body'] = f"{span['html']}"
|
|
|
+ if span.get('html', ''):
|
|
|
+ para_content[BlockType.TABLE_BODY] = f"{span['html']}"
|
|
|
|
|
|
if span.get('image_path', ''):
|
|
|
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
|
|
|
|
|
|
if block['type'] == BlockType.TABLE_CAPTION:
|
|
|
- para_content['table_caption'].append(merge_para_with_text(block))
|
|
|
+ para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
|
|
|
if block['type'] == BlockType.TABLE_FOOTNOTE:
|
|
|
- para_content['table_footnote'].append(merge_para_with_text(block))
|
|
|
+ para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
|
|
|
|
|
|
para_content['page_idx'] = page_idx
|
|
|
|