|
|
@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
|
|
|
layout_bbox_list = []
|
|
|
|
|
|
+ table_type_order = {
|
|
|
+ 'table_caption': 1,
|
|
|
+ 'table_body': 2,
|
|
|
+ 'table_footnote': 3
|
|
|
+ }
|
|
|
for page in pdf_info:
|
|
|
page_block_list = []
|
|
|
for block in page['para_blocks']:
|
|
|
- bbox = block['bbox']
|
|
|
- page_block_list.append(bbox)
|
|
|
+ if block['type'] in [
|
|
|
+ BlockType.Text,
|
|
|
+ BlockType.Title,
|
|
|
+ BlockType.InterlineEquation,
|
|
|
+ BlockType.List,
|
|
|
+ BlockType.Index,
|
|
|
+ ]:
|
|
|
+ bbox = block['bbox']
|
|
|
+ page_block_list.append(bbox)
|
|
|
+ elif block['type'] in [BlockType.Image]:
|
|
|
+ for sub_block in block['blocks']:
|
|
|
+ bbox = sub_block['bbox']
|
|
|
+ page_block_list.append(bbox)
|
|
|
+ elif block['type'] in [BlockType.Table]:
|
|
|
+ sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
|
|
|
+ for sub_block in sorted_blocks:
|
|
|
+ bbox = sub_block['bbox']
|
|
|
+ page_block_list.append(bbox)
|
|
|
+
|
|
|
layout_bbox_list.append(page_block_list)
|
|
|
|
|
|
pdf_docs = fitz.open('pdf', pdf_bytes)
|
|
|
@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
for i, page in enumerate(pdf_docs):
|
|
|
|
|
|
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
|
|
|
- draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
|
|
|
+ # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
|
|
|
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
|
|
|
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
|
|
|
draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
|
|
|
- draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
|
|
|
+ # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
|
|
|
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
|
|
|
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
|
|
|
draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
|
|
|
@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
|
for page in pdf_info:
|
|
|
page_line_list = []
|
|
|
for block in page['preproc_blocks']:
|
|
|
- if block['type'] in ['text', 'title', 'interline_equation']:
|
|
|
+ if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
|
|
|
for line in block['lines']:
|
|
|
bbox = line['bbox']
|
|
|
index = line['index']
|
|
|
page_line_list.append({'index': index, 'bbox': bbox})
|
|
|
- if block['type'] in ['table', 'image']:
|
|
|
- bbox = block['bbox']
|
|
|
- index = block['index']
|
|
|
- page_line_list.append({'index': index, 'bbox': bbox})
|
|
|
- # for line in block['lines']:
|
|
|
- # bbox = line['bbox']
|
|
|
- # index = line['index']
|
|
|
- # page_line_list.append({'index': index, 'bbox': bbox})
|
|
|
+ if block['type'] in [BlockType.Image, BlockType.Table]:
|
|
|
+ for sub_block in block['blocks']:
|
|
|
+ if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
|
|
+ for line in sub_block['virtual_lines']:
|
|
|
+ bbox = line['bbox']
|
|
|
+ index = line['index']
|
|
|
+ page_line_list.append({'index': index, 'bbox': bbox})
|
|
|
+ elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
|
|
|
+ for line in sub_block['lines']:
|
|
|
+ bbox = line['bbox']
|
|
|
+ index = line['index']
|
|
|
+ page_line_list.append({'index': index, 'bbox': bbox})
|
|
|
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
|
|
|
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
|
|
|
pdf_docs = fitz.open('pdf', pdf_bytes)
|