瀏覽代碼

Merge pull request #1601 from myhloli/dev

refactor(pdf_parse): comment out char bbox validation logic
Xiaomeng Zhao 10 月之前
父節點
當前提交
c7a3a68316
共有 2 個文件被更改,包括 9 次插入和 5 次刪除
  1. +5 -2
      magic_pdf/libs/boxbase.py
  2. +4 -3
      magic_pdf/pdf_parse_union_core_v2.py

+ 5 - 2
magic_pdf/libs/boxbase.py

@@ -185,10 +185,13 @@ def calculate_iou(bbox1, bbox2):
     bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
     bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
 
+    if any([bbox1_area == 0, bbox2_area == 0]):
+        return 0
+
     # Compute the intersection over union by taking the intersection area
     # and dividing it by the sum of both areas minus the intersection area
-    iou = intersection_area / float(bbox1_area + bbox2_area -
-                                    intersection_area)
+    iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
+
     return iou
 
 

+ 4 - 3
magic_pdf/pdf_parse_union_core_v2.py

@@ -118,9 +118,10 @@ def fill_char_in_spans(spans, all_chars):
 
     for char in all_chars:
         # 跳过非法bbox的char
-        x1, y1, x2, y2 = char['bbox']
-        if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
-            continue
+        # x1, y1, x2, y2 = char['bbox']
+        # if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
+        #     continue
+
         for span in spans:
             if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
                 span['chars'].append(char)