Prechádzať zdrojové kódy

修复spans为空list导致的IndexError: list index out of range

赵小蒙 1 rok pred
rodič
commit
a01356400e

+ 29 - 26
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
     return line_objects
 
 def merge_spans_to_line(spans):
-    # 按照y0坐标排序
-    spans.sort(key=lambda span: span['bbox'][1])
-
-    lines = []
-    current_line = [spans[0]]
-    for span in spans[1:]:
-        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
-        # image和table类型,同上
-        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
-            # 则开始新行
-            lines.append(current_line)
-            current_line = [span]
-            continue
-
-        # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            current_line.append(span)
-        else:
-            # 否则,开始新行
+    if len(spans) == 0:
+        return []
+    else:
+        # 按照y0坐标排序
+        spans.sort(key=lambda span: span['bbox'][1])
+
+        lines = []
+        current_line = [spans[0]]
+        for span in spans[1:]:
+            # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
+            # image和table类型,同上
+            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
+                # 则开始新行
+                lines.append(current_line)
+                current_line = [span]
+                continue
+
+            # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+                current_line.append(span)
+            else:
+                # 否则,开始新行
+                lines.append(current_line)
+                current_line = [span]
+
+        # 添加最后一行
+        if current_line:
             lines.append(current_line)
-            current_line = [span]
 
-    # 添加最后一行
-    if current_line:
-        lines.append(current_line)
-
-    return lines
+        return lines
 
 def merge_spans_to_line_by_layout(spans, layout_bboxes):
     lines = []

+ 60 - 57
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
 
 def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
     # displayed_list = []
+    # 如果spans为空,则不处理
+    if len(spans) == 0:
+        pass
+    else:
+        spans.sort(key=lambda span: span['bbox'][1])
+
+        lines = []
+        current_line = [spans[0]]
+        if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            displayed_list.append(spans[0])
+
+        line_first_y0 = spans[0]["bbox"][1]
+        line_first_y = spans[0]["bbox"][3]
+        # 用于给行间公式搜索
+        # text_inline_lines = []
+        for span in spans[1:]:
+            # if span.get("content","") == "78.":
+            #     print("debug")
+            # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
+            # image和table类型,同上
+            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
+                # 传入
+                if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+                    displayed_list.append(span)
+                # 则开始新行
+                lines.append(current_line)
+                if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+                    text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                current_line = [span]
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+                continue
 
-    spans.sort(key=lambda span: span['bbox'][1])
-
-    lines = []
-    current_line = [spans[0]]
-    if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
-        displayed_list.append(spans[0])
+            # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+                if span["type"] == "text":
+                    line_first_y0 = span["bbox"][1]
+                    line_first_y = span["bbox"][3]
+                current_line.append(span)
 
-    line_first_y0 = spans[0]["bbox"][1]
-    line_first_y = spans[0]["bbox"][3]
-    # 用于给行间公式搜索
-    # text_inline_lines = []
-    for span in spans[1:]:
-        # if span.get("content","") == "78.":
-        #     print("debug")
-        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
-        # image和table类型,同上
-        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
-            # 传入
-            if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
-                displayed_list.append(span)
-            # 则开始新行
-            lines.append(current_line)
-            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+            else:
+                # 否则,开始新行
+                lines.append(current_line)
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-            continue
-
-        # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            if span["type"] == "text":
+                current_line = [span]
                 line_first_y0 = span["bbox"][1]
                 line_first_y = span["bbox"][3]
-            current_line.append(span)
 
-        else:
-            # 否则,开始新行
+            # 添加最后一行
+        if current_line:
             lines.append(current_line)
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-
-        # 添加最后一行
-    if current_line:
-        lines.append(current_line)
-        if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-    for line in text_inline_lines:
-        # 按照x0坐标排序
-        current_line = line[0]
-        current_line.sort(key=lambda span: span['bbox'][0])
-
-    # 调整每一个文字行内bbox统一
-    for line in text_inline_lines:
-        current_line, (line_first_y0, line_first_y) = line
-        for span in current_line:
-            span["bbox"][1] = line_first_y0
-            span["bbox"][3] = line_first_y
-
-    # return spans, displayed_list, text_inline_lines
+            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+        for line in text_inline_lines:
+            # 按照x0坐标排序
+            current_line = line[0]
+            current_line.sort(key=lambda span: span['bbox'][0])
+
+        # 调整每一个文字行内bbox统一
+        for line in text_inline_lines:
+            current_line, (line_first_y0, line_first_y) = line
+            for span in current_line:
+                span["bbox"][1] = line_first_y0
+                span["bbox"][3] = line_first_y
+
+        # return spans, displayed_list, text_inline_lines
 
 
 def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):