Răsfoiți Sursa

Merge pull request #48 from icecraft/feat/fix_zero_height

fix: remove_overlap leading to zero-height bbox case
myhloli 1 an în urmă
părinte
comite
f6d8f6cab5

+ 5 - 1
magic_pdf/libs/math.py

@@ -2,4 +2,8 @@ def float_gt(a, b):
     if 0.0001 >= abs(a -b):
         return False
     return a > b
-    
+    
+def float_equal(a, b):
+    if 0.0001 >= abs(a-b):
+        return True
+    return False

+ 6 - 2
magic_pdf/pdf_parse_by_txt_v2.py

@@ -31,7 +31,8 @@ from magic_pdf.pre_proc.equations_replace import (
     replace_equations_in_textblock,
 )
 from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
-
+from magic_pdf.libs.math import float_equal
+from magic_pdf.para.para_split_v2 import para_split
 
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
@@ -48,6 +49,9 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     for v in text_blocks:
         for line in v["lines"]:
             for span in line["spans"]:
+                bbox = span["bbox"]
+                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
+                    continue
                 spans.append(
                     {
                         "bbox": list(span["bbox"]),
@@ -167,7 +171,7 @@ def parse_pdf_by_txt(
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """分段"""
-    pass
+    para_split(pdf_info_dict, debug_mode=debug_mode)
 
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)

+ 9 - 9
magic_pdf/pre_proc/remove_bbox_overlap.py

@@ -5,7 +5,7 @@ def _remove_overlap_between_bbox(spans):
     res = []
     for v in spans:
         for i in range(len(res)):
-            if _is_in(res[i]["bbox"], v["bbox"]):
+            if _is_in(res[i]["bbox"], v["bbox"]) or _is_in(v["bbox"], res[i]["bbox"]):
                 continue
             if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
                 ix0, iy0, ix1, iy1 = res[i]["bbox"]
@@ -17,21 +17,21 @@ def _remove_overlap_between_bbox(spans):
                 if diff_y > diff_x:
                     if x1 >= ix1:
                         mid = (x0 + ix1) // 2
-                        ix1 = min(mid, ix1)
-                        x0 = max(mid + 1, x0)
+                        ix1 = min(mid - 0.25, ix1)
+                        x0 = max(mid + 0.25, x0)
                     else:
                         mid = (ix0 + x1) // 2
-                        ix0 = max(mid + 1, ix0)
-                        x1 = min(mid, x1)
+                        ix0 = max(mid + 0.25, ix0)
+                        x1 = min(mid -0.25, x1)
                 else:
                     if y1 >= iy1:
                         mid = (y0 + iy1) // 2
-                        y0 = max(mid + 1, y0)
-                        iy1 = min(iy1, mid)
+                        y0 = max(mid + 0.25, y0)
+                        iy1 = min(iy1, mid-0.25)
                     else:
                         mid = (iy0 + y1) // 2
-                        y1 = min(y1, mid)
-                        iy0 = max(mid + 1, iy0)
+                        y1 = min(y1, mid-0.25)
+                        iy0 = max(mid + 0.25, iy0)
                 res[i]["bbox"] = [ix0, iy0, ix1, iy1]
                 v["bbox"] = [x0, y0, x1, y1]