瀏覽代碼

在layout.pdf中绘制drop的bbox

赵小蒙 1 年之前
父節點
當前提交
b6f051d88f

+ 1 - 1
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,5 +1,4 @@
 def mk_nlp_markdown(pdf_info_dict: dict):
-
     markdown = []
 
     for _, page_info in pdf_info_dict.items():
@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                 markdown.append(line_text.strip() + '  ')
     return '\n'.join(markdown)
 
+
 def mk_mm_markdown(pdf_info_dict: dict):
 
     markdown = []

+ 10 - 3
magic_pdf/libs/draw_bbox.py

@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
 
 def draw_layout_bbox(pdf_info_dict, input_path, out_path):
     layout_bbox_list = []
+    dropped_bbox_list = []
     for page in pdf_info_dict.values():
-        page_list = []
+        page_layout_list = []
+        page_dropped_list = []
         for layout in page['layout_bboxes']:
-            page_list.append(layout['layout_bbox'])
-        layout_bbox_list.append(page_list)
+            page_layout_list.append(layout['layout_bbox'])
+        layout_bbox_list.append(page_layout_list)
+        for drop_tag, dropped_bboxes in page['dropped_bboxes'].items():
+            for dropped_bbox in dropped_bboxes:
+                page_dropped_list.append(dropped_bbox)
+        dropped_bbox_list.append(page_dropped_list)
 
     doc = fitz.open(input_path)
     for i, page in enumerate(doc):
         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
+        draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
     # Save the PDF
     doc.save(f"{out_path}/layout.pdf")
 

+ 5 - 2
magic_pdf/pdf_parse_by_ocr.py

@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                              images, tables, interline_equations, inline_equations,
-                             dropped_text_block, dropped_image_block, dropped_table_block):
+                             dropped_text_block, dropped_image_block, dropped_table_block,
+                             need_remove_spans_bboxes_dict):
     return_dict = {
         'preproc_blocks': blocks,
         'layout_bboxes': layout_bboxes,
@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
         'dropped_text_block': dropped_text_block,
         'dropped_image_block': dropped_image_block,
         'dropped_table_block': dropped_table_block,
+        'dropped_bboxes': need_remove_spans_bboxes_dict,
     }
     return return_dict
 
@@ -233,7 +235,8 @@ def parse_pdf_by_ocr(
         # 构造pdf_info_dict
         page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                              images, tables, interline_equations, inline_equations,
-                                             dropped_text_block, dropped_image_block, dropped_table_block)
+                                             dropped_text_block, dropped_image_block, dropped_table_block,
+                                             need_remove_spans_bboxes_dict)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     # 在测试时,保存调试信息

+ 1 - 1
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
         # 遍历spans,将每个span放入对应的layout中
         layout_sapns = []
         for span in spans:
-            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8:
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.65:
                 layout_sapns.append(span)
         # 如果layout_sapns不为空,则放入new_spans中
         if len(layout_sapns) > 0:

+ 4 - 4
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
     dropped_text_block = []
     dropped_image_block = []
     dropped_table_block = []
-    for key, value in need_remove_spans_bboxes_dict.items():
-        # logger.info(f"remove spans by bbox dict, key: {key}, value: {value}")
+    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
+        # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
         need_remove_spans = []
         for span in spans:
-            for removed_bbox in value:
+            for removed_bbox in removed_bboxes:
                 if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
                     need_remove_spans.append(span)
                     break
 
         for span in need_remove_spans:
             spans.remove(span)
-            span['tag'] = key
+            span['tag'] = drop_tag
             if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
                 dropped_text_block.append(span)
             elif span['type'] == 'image':