Browse Source

span进入具体的layout后,需要在下次循环前将该span移除

赵小蒙 1 year ago
parent
commit
68e83c124f
1 changed file with 4 additions and 0 deletions
  1. 4 0
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 4 - 0
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -77,6 +77,10 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
                 layout_sapns.append(span)
                 layout_sapns.append(span)
         new_spans.append(layout_sapns)
         new_spans.append(layout_sapns)
 
 
+        # 从spans删除已经放入layout_sapns中的span
+        for layout_sapn in layout_sapns:
+            spans.remove(layout_sapn)
+
     for layout_sapns in new_spans:
     for layout_sapns in new_spans:
         layout_lines = merge_spans_to_line(layout_sapns)
         layout_lines = merge_spans_to_line(layout_sapns)
         lines.extend(layout_lines)
         lines.extend(layout_lines)