Browse Source

feat: draw block based on block_type

许瑞 1 year ago
parent
commit
4aa48329a4
1 changed file with 134 additions and 40 deletions
  1. 134 40
      magic_pdf/libs/draw_bbox.py

+ 134 - 40
magic_pdf/libs/draw_bbox.py

@@ -12,9 +12,23 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
         x0, y0, x1, y1 = bbox
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
         if fill_config:
-            page.draw_rect(rect_coords, color=None, fill=new_rgb, fill_opacity=0.3, width=0.5, overlay=True)  # Draw the rectangle
+            page.draw_rect(
+                rect_coords,
+                color=None,
+                fill=new_rgb,
+                fill_opacity=0.3,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
         else:
-            page.draw_rect(rect_coords, color=new_rgb, fill=None, fill_opacity=1, width=0.5, overlay=True)  # Draw the rectangle
+            page.draw_rect(
+                rect_coords,
+                color=new_rgb,
+                fill=None,
+                fill_opacity=1,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
 
 
 def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
@@ -27,37 +41,113 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
         x0, y0, x1, y1 = bbox
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
         if fill_config:
-            page.draw_rect(rect_coords, color=None, fill=new_rgb, fill_opacity=0.3, width=0.5, overlay=True)  # Draw the rectangle
+            page.draw_rect(
+                rect_coords,
+                color=None,
+                fill=new_rgb,
+                fill_opacity=0.3,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
         else:
-            page.draw_rect(rect_coords, color=new_rgb, fill=None, fill_opacity=1, width=0.5, overlay=True)  # Draw the rectangle
-        page.insert_text((x0, y0+10), str(j + 1), fontsize=10, color=new_rgb)  # Insert the index at the top left corner of the rectangle
+            page.draw_rect(
+                rect_coords,
+                color=new_rgb,
+                fill=None,
+                fill_opacity=1,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+        page.insert_text(
+            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
+        )  # Insert the index at the top left corner of the rectangle
 
 
 def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
     layout_bbox_list = []
     blocks_bbox_list = []
     dropped_bbox_list = []
+    tables_list, tables_body_list, tables_caption_list, tables_footnote_list = (
+        [],
+        [],
+        [],
+        [],
+    )
+    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
     for page in pdf_info:
         page_layout_list = []
         page_dropped_list = []
         page_blocks_bbox_list = []
-        for layout in page['layout_bboxes']:
-            page_layout_list.append(layout['layout_bbox'])
+        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
+        imgs, imgs_body, imgs_caption = [], [], []
+        titles = []
+        texts = []
+        interequations = []
+        for layout in page["layout_bboxes"]:
+            page_layout_list.append(layout["layout_bbox"])
         layout_bbox_list.append(page_layout_list)
-        for dropped_bbox in page['discarded_blocks']:
-            page_dropped_list.append(dropped_bbox['bbox'])
+        for dropped_bbox in page["discarded_blocks"]:
+            page_dropped_list.append(dropped_bbox["bbox"])
         dropped_bbox_list.append(page_dropped_list)
-        for block in page['para_blocks']:
-            page_blocks_bbox_list.append(block['bbox'])
-        blocks_bbox_list.append(page_blocks_bbox_list)
+        for block in page["para_blocks"]:
+            bbox = block["bbox"]
+            if block["type"] == BlockType.Table:
+                tables.append(bbox)
+                for nested_block in block["blocks"]:
+                    bbox = nested_block["bbox"]
+                    if nested_block["type"] == BlockType.TableBody:
+                        tables_body.append(bbox)
+                    elif nested_block["type"] == BlockType.TableCaption:
+                        tables_caption.append(bbox)
+                    elif nested_block["type"] == BlockType.TableFootnote:
+                        tables_footnote.append(bbox)
+            elif block["type"] == BlockType.Image:
+                imgs.append(bbox)
+                for nested_block in block["blocks"]:
+                    bbox = nested_block["bbox"]
+                    if nested_block["type"] == BlockType.ImageBody:
+                        imgs_body.append(bbox)
+                    elif nested_block["type"] == BlockType.ImageCaption:
+                        imgs_caption.append(bbox)
+            elif block["type"] == BlockType.Title:
+                titles.append(bbox)
+            elif block["type"] == BlockType.Text:
+                texts.append(bbox)
+            elif block["type"] == BlockType.InterlineEquation:
+                interequations.append(bbox)
+        tables_list.append(tables)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_list.append(imgs)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+
     pdf_docs = fitz.open("pdf", pdf_bytes)
     for i, page in enumerate(pdf_docs):
         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
         draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0], True)
-        draw_bbox_without_number(i, blocks_bbox_list, page, [0, 0, 255], True)
+        draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
+        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
+        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
+        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_without_number(i, interequations_list, page, [160, 160, 160], True)
+
     # Save the PDF
     pdf_docs.save(f"{out_path}/layout.pdf")
 
+
 def draw_span_bbox(pdf_info, pdf_bytes, out_path):
     text_list = []
     inline_equation_list = []
@@ -70,34 +160,38 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
         page_interline_equation_list = []
         page_image_list = []
         page_table_list = []
-        for block in page['para_blocks']:
-            if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
-                for line in block['lines']:
-                    for span in line['spans']:
-                        if span['type'] == ContentType.Text:
-                            page_text_list.append(span['bbox'])
-                        elif span['type'] == ContentType.InlineEquation:
-                            page_inline_equation_list.append(span['bbox'])
-                        elif span['type'] == ContentType.InterlineEquation:
-                            page_interline_equation_list.append(span['bbox'])
-                        elif span['type'] == ContentType.Image:
-                            page_image_list.append(span['bbox'])
-                        elif span['type'] == ContentType.Table:
-                            page_table_list.append(span['bbox'])
-            elif block['type'] in [BlockType.Image, BlockType.Table]:
+        for block in page["para_blocks"]:
+            if block["type"] in [
+                BlockType.Text,
+                BlockType.Title,
+                BlockType.InterlineEquation,
+            ]:
+                for line in block["lines"]:
+                    for span in line["spans"]:
+                        if span["type"] == ContentType.Text:
+                            page_text_list.append(span["bbox"])
+                        elif span["type"] == ContentType.InlineEquation:
+                            page_inline_equation_list.append(span["bbox"])
+                        elif span["type"] == ContentType.InterlineEquation:
+                            page_interline_equation_list.append(span["bbox"])
+                        elif span["type"] == ContentType.Image:
+                            page_image_list.append(span["bbox"])
+                        elif span["type"] == ContentType.Table:
+                            page_table_list.append(span["bbox"])
+            elif block["type"] in [BlockType.Image, BlockType.Table]:
                 for sub_block in block["blocks"]:
-                    for line in sub_block['lines']:
-                        for span in line['spans']:
-                            if span['type'] == ContentType.Text:
-                                page_text_list.append(span['bbox'])
-                            elif span['type'] == ContentType.InlineEquation:
-                                page_inline_equation_list.append(span['bbox'])
-                            elif span['type'] == ContentType.InterlineEquation:
-                                page_interline_equation_list.append(span['bbox'])
-                            elif span['type'] == ContentType.Image:
-                                page_image_list.append(span['bbox'])
-                            elif span['type'] == ContentType.Table:
-                                page_table_list.append(span['bbox'])
+                    for line in sub_block["lines"]:
+                        for span in line["spans"]:
+                            if span["type"] == ContentType.Text:
+                                page_text_list.append(span["bbox"])
+                            elif span["type"] == ContentType.InlineEquation:
+                                page_inline_equation_list.append(span["bbox"])
+                            elif span["type"] == ContentType.InterlineEquation:
+                                page_interline_equation_list.append(span["bbox"])
+                            elif span["type"] == ContentType.Image:
+                                page_image_list.append(span["bbox"])
+                            elif span["type"] == ContentType.Table:
+                                page_table_list.append(span["bbox"])
         text_list.append(page_text_list)
         inline_equation_list.append(page_inline_equation_list)
         interline_equation_list.append(page_interline_equation_list)