|
|
@@ -1,4 +1,7 @@
|
|
|
-from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
|
|
|
+ calculate_overlap_area_in_bbox1_area_ratio
|
|
|
|
|
|
|
|
|
# Remove the smaller ones among overlapping spans
|
|
|
@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
|
|
|
return spans
|
|
|
|
|
|
|
|
|
# Sort the spans of every line from left to right.
def line_sort_spans_by_left_to_right(lines):
    """Sort each line's spans by x0 and wrap the line with its bounding box.

    Args:
        lines: Iterable of lines. Each line is a list of span dicts whose
            ``'bbox'`` entry is indexed as ``[x0, y0, x1, y1]``.

    Returns:
        A list of ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dicts, one
        per non-empty input line, in input order. ``"spans"`` is the very
        list object that was passed in, sorted in place by x0.
    """
    line_objects = []
    for line in lines:
        # A line without spans has no bounding box: min()/max() would raise
        # ValueError on the empty sequence, so such lines are left out.
        if not line:
            continue
        # Sort in place by the x0 coordinate.
        line.sort(key=lambda span: span['bbox'][0])
        # The line's box is the union of the boxes of all its spans.
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })
    return line_objects
|
|
|
+
|
|
|
def merge_spans_to_line(spans):
|
|
|
# 按照y0坐标排序
|
|
|
spans.sort(key=lambda span: span['bbox'][1])
|
|
|
@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
|
|
|
for span in spans[1:]:
|
|
|
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
|
|
|
# image和table类型,同上
|
|
|
- if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
|
|
|
+ if span['type'] in ["displayed_equation", "image", "table"] or any(
|
|
|
+ s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
|
|
|
# 则开始新行
|
|
|
lines.append(current_line)
|
|
|
current_line = [span]
|
|
|
@@ -41,20 +63,25 @@ def merge_spans_to_line(spans):
|
|
|
if current_line:
|
|
|
lines.append(current_line)
|
|
|
|
|
|
- # 计算每行的边界框,并对每行中的span按照x0进行排序
|
|
|
- line_objects = []
|
|
|
- for line in lines:
|
|
|
- # 按照x0坐标排序
|
|
|
- line.sort(key=lambda span: span['bbox'][0])
|
|
|
- line_bbox = [
|
|
|
- min(span['bbox'][0] for span in line), # x0
|
|
|
- min(span['bbox'][1] for span in line), # y0
|
|
|
- max(span['bbox'][2] for span in line), # x1
|
|
|
- max(span['bbox'][3] for span in line), # y1
|
|
|
- ]
|
|
|
- line_objects.append({
|
|
|
- "bbox": line_bbox,
|
|
|
- "spans": line,
|
|
|
- })
|
|
|
+ return lines
|
|
|
|
|
|
- return line_objects
|
|
|
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Group spans by layout region, merge each group into lines, sort them.

    Args:
        spans: Span dicts, each carrying a ``'bbox'`` entry.
        layout_bboxes: Items carrying a ``'layout_bbox'`` entry; lines are
            produced layout by layout, in this order.

    Returns:
        The value returned by ``line_sort_spans_by_left_to_right`` for the
        lines collected from all layouts.
    """
    # Assign spans to layouts: a span belongs to a layout when its overlap
    # ratio with the layout box exceeds 0.8 (presumably the share of the
    # span's own area that lies inside the layout box -- TODO confirm
    # against calculate_overlap_area_in_bbox1_area_ratio).
    spans_per_layout = [
        [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], item['layout_bbox']) > 0.8
        ]
        for item in layout_bboxes
    ]

    lines = []
    for layout_spans in spans_per_layout:
        # Skip layouts that captured no spans: there is nothing to merge,
        # and merge_spans_to_line is not known to accept an empty list
        # (NOTE(review): it appears to read spans[0] -- confirm).
        if not layout_spans:
            continue
        lines.extend(merge_spans_to_line(layout_spans))

    # Sort the spans inside every line and attach each line's bounding box.
    return line_sort_spans_by_left_to_right(lines)
|