|
|
@@ -12,9 +12,23 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
|
|
|
x0, y0, x1, y1 = bbox
|
|
|
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
|
|
if fill_config:
|
|
|
- page.draw_rect(rect_coords, color=None, fill=new_rgb, fill_opacity=0.3, width=0.5, overlay=True) # Draw the rectangle
|
|
|
+ page.draw_rect(
|
|
|
+ rect_coords,
|
|
|
+ color=None,
|
|
|
+ fill=new_rgb,
|
|
|
+ fill_opacity=0.3,
|
|
|
+ width=0.5,
|
|
|
+ overlay=True,
|
|
|
+ ) # Draw the rectangle
|
|
|
else:
|
|
|
- page.draw_rect(rect_coords, color=new_rgb, fill=None, fill_opacity=1, width=0.5, overlay=True) # Draw the rectangle
|
|
|
+ page.draw_rect(
|
|
|
+ rect_coords,
|
|
|
+ color=new_rgb,
|
|
|
+ fill=None,
|
|
|
+ fill_opacity=1,
|
|
|
+ width=0.5,
|
|
|
+ overlay=True,
|
|
|
+ ) # Draw the rectangle
|
|
|
|
|
|
|
|
|
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
|
|
|
@@ -27,37 +41,113 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
|
|
|
x0, y0, x1, y1 = bbox
|
|
|
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
|
|
if fill_config:
|
|
|
- page.draw_rect(rect_coords, color=None, fill=new_rgb, fill_opacity=0.3, width=0.5, overlay=True) # Draw the rectangle
|
|
|
+ page.draw_rect(
|
|
|
+ rect_coords,
|
|
|
+ color=None,
|
|
|
+ fill=new_rgb,
|
|
|
+ fill_opacity=0.3,
|
|
|
+ width=0.5,
|
|
|
+ overlay=True,
|
|
|
+ ) # Draw the rectangle
|
|
|
else:
|
|
|
- page.draw_rect(rect_coords, color=new_rgb, fill=None, fill_opacity=1, width=0.5, overlay=True) # Draw the rectangle
|
|
|
- page.insert_text((x0, y0+10), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle
|
|
|
+ page.draw_rect(
|
|
|
+ rect_coords,
|
|
|
+ color=new_rgb,
|
|
|
+ fill=None,
|
|
|
+ fill_opacity=1,
|
|
|
+ width=0.5,
|
|
|
+ overlay=True,
|
|
|
+ ) # Draw the rectangle
|
|
|
+ page.insert_text(
|
|
|
+ (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
|
|
|
+ ) # Insert the index at the top left corner of the rectangle
|
|
|
|
|
|
|
|
|
def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
    """Render layout, discarded and per-type block boxes into ``layout.pdf``.

    For every entry of ``pdf_info`` the bounding boxes are gathered into
    per-page lists, then drawn page by page onto the PDF opened from
    ``pdf_bytes``. The annotated document is saved as
    ``{out_path}/layout.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts. Each must provide
            ``"layout_bboxes"`` (items with ``"layout_bbox"``),
            ``"discarded_blocks"`` (items with ``"bbox"``) and
            ``"para_blocks"`` (items with ``"type"`` and ``"bbox"``; table
            and image blocks additionally carry nested ``"blocks"``).
        pdf_bytes: Raw PDF content handed to ``fitz.open("pdf", ...)``.
        out_path: Directory that receives ``layout.pdf``.

    Raises:
        KeyError: If a page or block lacks one of the keys listed above.
    """
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    for page in pdf_info:
        layout_bbox_list.append(
            [layout["layout_bbox"] for layout in page["layout_bboxes"]]
        )
        dropped_bbox_list.append(
            [dropped["bbox"] for dropped in page["discarded_blocks"]]
        )

        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles, texts, interequations = [], [], []

        for block in page["para_blocks"]:
            bbox = block["bbox"]
            block_type = block["type"]
            if block_type == BlockType.Table:
                tables.append(bbox)
                # A table is a container: its body, caption and footnote
                # are nested blocks, each collected into its own list.
                for nested_block in block["blocks"]:
                    nested_bbox = nested_block["bbox"]
                    nested_type = nested_block["type"]
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                imgs.append(bbox)
                for nested_block in block["blocks"]:
                    nested_bbox = nested_block["bbox"]
                    nested_type = nested_block["type"]
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)
            # Blocks of any other type are intentionally not drawn.

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    # (per-page bbox lists, RGB colour) in drawing order. Order matters:
    # every layer is drawn with overlay=True, so later layers sit on top.
    filled_layers = [
        (dropped_bbox_list, [0, 255, 0]),
        (tables_list, [153, 153, 0]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_list, [51, 102, 0]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [160, 160, 160]),
    ]

    pdf_docs = fitz.open("pdf", pdf_bytes)
    try:
        # NOTE(review): the helpers presumably index each list with the page
        # number `i`, so pdf_info should have one entry per PDF page - confirm.
        for i, page in enumerate(pdf_docs):
            # Layout boxes: outlined (not filled) and numbered.
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
            for bbox_list, rgb in filled_layers:
                # Pass a fresh list per call, as separate literals would.
                draw_bbox_without_number(i, bbox_list, page, list(rgb), True)

        # Save the PDF
        pdf_docs.save(f"{out_path}/layout.pdf")
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()
|
|
|
|
|
|
+
|
|
|
def draw_span_bbox(pdf_info, pdf_bytes, out_path):
|
|
|
text_list = []
|
|
|
inline_equation_list = []
|
|
|
@@ -70,34 +160,38 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
|
|
|
page_interline_equation_list = []
|
|
|
page_image_list = []
|
|
|
page_table_list = []
|
|
|
- for block in page['para_blocks']:
|
|
|
- if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
|
|
|
- for line in block['lines']:
|
|
|
- for span in line['spans']:
|
|
|
- if span['type'] == ContentType.Text:
|
|
|
- page_text_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.InlineEquation:
|
|
|
- page_inline_equation_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.InterlineEquation:
|
|
|
- page_interline_equation_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.Image:
|
|
|
- page_image_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.Table:
|
|
|
- page_table_list.append(span['bbox'])
|
|
|
- elif block['type'] in [BlockType.Image, BlockType.Table]:
|
|
|
+ for block in page["para_blocks"]:
|
|
|
+ if block["type"] in [
|
|
|
+ BlockType.Text,
|
|
|
+ BlockType.Title,
|
|
|
+ BlockType.InterlineEquation,
|
|
|
+ ]:
|
|
|
+ for line in block["lines"]:
|
|
|
+ for span in line["spans"]:
|
|
|
+ if span["type"] == ContentType.Text:
|
|
|
+ page_text_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.InlineEquation:
|
|
|
+ page_inline_equation_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.InterlineEquation:
|
|
|
+ page_interline_equation_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.Image:
|
|
|
+ page_image_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.Table:
|
|
|
+ page_table_list.append(span["bbox"])
|
|
|
+ elif block["type"] in [BlockType.Image, BlockType.Table]:
|
|
|
for sub_block in block["blocks"]:
|
|
|
- for line in sub_block['lines']:
|
|
|
- for span in line['spans']:
|
|
|
- if span['type'] == ContentType.Text:
|
|
|
- page_text_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.InlineEquation:
|
|
|
- page_inline_equation_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.InterlineEquation:
|
|
|
- page_interline_equation_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.Image:
|
|
|
- page_image_list.append(span['bbox'])
|
|
|
- elif span['type'] == ContentType.Table:
|
|
|
- page_table_list.append(span['bbox'])
|
|
|
+ for line in sub_block["lines"]:
|
|
|
+ for span in line["spans"]:
|
|
|
+ if span["type"] == ContentType.Text:
|
|
|
+ page_text_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.InlineEquation:
|
|
|
+ page_inline_equation_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.InterlineEquation:
|
|
|
+ page_interline_equation_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.Image:
|
|
|
+ page_image_list.append(span["bbox"])
|
|
|
+ elif span["type"] == ContentType.Table:
|
|
|
+ page_table_list.append(span["bbox"])
|
|
|
text_list.append(page_text_list)
|
|
|
inline_equation_list.append(page_inline_equation_list)
|
|
|
interline_equation_list.append(page_interline_equation_list)
|