浏览代码

fix: remove high iou and low confidence blocks

赵小蒙 1 年之前
父节点
当前提交
9a3774631f
共有 2 个文件被更改，包括 36 次插入和 6 次删除
  1. 28 3
      magic_pdf/model/magic_model.py
  2. 8 3
      magic_pdf/pdf_parse_union_core.py

+ 28 - 3
magic_pdf/model/magic_model.py

@@ -15,7 +15,7 @@ from magic_pdf.libs.boxbase import (
     bbox_relative_pos,
     bbox_distance,
     _is_part_overlap,
-    calculate_overlap_area_in_bbox1_area_ratio,
+    calculate_overlap_area_in_bbox1_area_ratio, calculate_iou,
 )
 from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
 
@@ -51,7 +51,7 @@ class MagicModel:
             for need_remove in need_remove_list:
                 layout_dets.remove(need_remove)
 
-    def __fix_by_confidence(self):
+    def __fix_by_remove_low_confidence(self):
         for model_page_info in self.__model_list:
             need_remove_list = []
             layout_dets = model_page_info["layout_dets"]
@@ -63,11 +63,36 @@ class MagicModel:
             for need_remove in need_remove_list:
                 layout_dets.remove(need_remove)
 
+    def __fix_by_remove_high_iou_and_low_confidence(self):
+        for model_page_info in self.__model_list:
+            need_remove_list = []
+            layout_dets = model_page_info["layout_dets"]
+            for layout_det1 in layout_dets:
+                for layout_det2 in layout_dets:
+                    if layout_det1 == layout_det2:
+                        continue
+                    if layout_det1["category_id"] in [0,1,2,3,4,5,6,7,8,9] and layout_det2["category_id"] in [0,1,2,3,4,5,6,7,8,9]:
+                        if calculate_iou(layout_det1['bbox'], layout_det2['bbox']) > 0.9:
+                            if layout_det1['score'] < layout_det2['score']:
+                                layout_det_need_remove = layout_det1
+                            else:
+                                layout_det_need_remove = layout_det2
+
+                            if layout_det_need_remove not in need_remove_list:
+                                need_remove_list.append(layout_det_need_remove)
+                        else:
+                            continue
+                    else:
+                        continue
+            for need_remove in need_remove_list:
+                layout_dets.remove(need_remove)
+
     def __init__(self, model_list: list, docs: fitz.Document):
         self.__model_list = model_list
         self.__docs = docs
         self.__fix_axis()
-        self.__fix_by_confidence()
+        self.__fix_by_remove_low_confidence()
+        self.__fix_by_remove_high_iou_and_low_confidence()
 
     def __reduct_overlap(self, bboxes):
         N = len(bboxes)

+ 8 - 3
magic_pdf/pdf_parse_union_core.py

@@ -129,9 +129,14 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
 
     '''将所有区块的bbox整理到一起'''
-    all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
-        img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
-        interline_equations, page_w, page_h)
+    if len(interline_equation_blocks) > 0:
+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
+            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
+            interline_equation_blocks, page_w, page_h)
+    else:
+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
+            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
+            interline_equations, page_w, page_h)
 
     '''先处理不需要排版的discarded_blocks'''
     discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)