Selaa lähdekoodia

update draw_span_bbox logic

赵小蒙 1 vuosi sitten
vanhempi
commit
8a17926959
1 muutettua tiedostoa jossa 23 lisäystä ja 28 poistoa
  1. 23 28
      magic_pdf/libs/draw_bbox.py

+ 23 - 28
magic_pdf/libs/draw_bbox.py

@@ -151,6 +151,25 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
     dropped_list = []
     next_page_text_list = []
     next_page_inline_equation_list = []
+
+    def get_span_info(span):
+        if span["type"] == ContentType.Text:
+            if span.get(CROSS_PAGE, False):
+                next_page_text_list.append(span["bbox"])
+            else:
+                page_text_list.append(span["bbox"])
+        elif span["type"] == ContentType.InlineEquation:
+            if span.get(CROSS_PAGE, False):
+                next_page_inline_equation_list.append(span["bbox"])
+            else:
+                page_inline_equation_list.append(span["bbox"])
+        elif span["type"] == ContentType.InterlineEquation:
+            page_interline_equation_list.append(span["bbox"])
+        elif span["type"] == ContentType.Image:
+            page_image_list.append(span["bbox"])
+        elif span["type"] == ContentType.Table:
+            page_table_list.append(span["bbox"])
+
     for page in pdf_info:
         page_text_list = []
         page_inline_equation_list = []
@@ -162,10 +181,10 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
         # 将跨页的span放到移动到下一页的列表中
         if len(next_page_text_list) > 0:
             page_text_list.extend(next_page_text_list)
-            next_page_text_list = []
+            next_page_text_list.clear()
         if len(next_page_inline_equation_list) > 0:
             page_inline_equation_list.extend(next_page_inline_equation_list)
-            next_page_inline_equation_list = []
+            next_page_inline_equation_list.clear()
 
         # 构造dropped_list
         for block in page["discarded_blocks"]:
@@ -183,36 +202,12 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
             ]:
                 for line in block["lines"]:
                     for span in line["spans"]:
-                        if span["type"] == ContentType.Text:
-                            if span.get(CROSS_PAGE, False):
-                                next_page_text_list.append(span["bbox"])
-                            else:
-                                page_text_list.append(span["bbox"])
-                        elif span["type"] == ContentType.InlineEquation:
-                            if span.get(CROSS_PAGE, False):
-                                next_page_inline_equation_list.append(span["bbox"])
-                            else:
-                                page_inline_equation_list.append(span["bbox"])
-                        elif span["type"] == ContentType.InterlineEquation:
-                            page_interline_equation_list.append(span["bbox"])
-                        elif span["type"] == ContentType.Image:
-                            page_image_list.append(span["bbox"])
-                        elif span["type"] == ContentType.Table:
-                            page_table_list.append(span["bbox"])
+                        get_span_info(span)
             elif block["type"] in [BlockType.Image, BlockType.Table]:
                 for sub_block in block["blocks"]:
                     for line in sub_block["lines"]:
                         for span in line["spans"]:
-                            if span["type"] == ContentType.Text:
-                                page_text_list.append(span["bbox"])
-                            elif span["type"] == ContentType.InlineEquation:
-                                page_inline_equation_list.append(span["bbox"])
-                            elif span["type"] == ContentType.InterlineEquation:
-                                page_interline_equation_list.append(span["bbox"])
-                            elif span["type"] == ContentType.Image:
-                                page_image_list.append(span["bbox"])
-                            elif span["type"] == ContentType.Table:
-                                page_table_list.append(span["bbox"])
+                            get_span_info(span)
         text_list.append(page_text_list)
         inline_equation_list.append(page_inline_equation_list)
         interline_equation_list.append(page_interline_equation_list)