|
|
@@ -107,10 +107,10 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
|
|
|
or y0_1 > y1_2
|
|
|
) # box1在box2的下边
|
|
|
|
|
|
-
|
|
|
def remove_text_block_overlap_interline_equation_bbox(
|
|
|
interline_eq_bboxes, pymu_block_list
|
|
|
):
|
|
|
+
|
|
|
"""消除掉行行内公式有部分重叠的文本块的内容。
|
|
|
同时重新计算消除重叠之后文本块的大小"""
|
|
|
deleted_block = []
|
|
|
@@ -317,12 +317,7 @@ def replace_line_v2(eqinfo, line):
|
|
|
"descender": -0.3050000071525574,
|
|
|
"latex": "",
|
|
|
"origin": [337.1410153102337, 216.0205245153934],
|
|
|
- "bbox": [
|
|
|
- 337.1410153102337,
|
|
|
- 216.0205245153934,
|
|
|
- 390.4496373892022,
|
|
|
- 228.50171037628277,
|
|
|
- ],
|
|
|
+ "bbox": eqinfo["bbox"]
|
|
|
}
|
|
|
# equation_span = line['spans'][0].copy()
|
|
|
equation_span["latex"] = eqinfo['latex']
|
|
|
@@ -363,6 +358,11 @@ def replace_line_v2(eqinfo, line):
|
|
|
line["spans"].remove(first_overlap_span)
|
|
|
|
|
|
if len(tail_span_chars) > 0:
|
|
|
+ min_of_tail_span_x0 = min([chr["bbox"][0] for chr in tail_span_chars])
|
|
|
+ min_of_tail_span_y0 = min([chr["bbox"][1] for chr in tail_span_chars])
|
|
|
+ max_of_tail_span_x1 = max([chr["bbox"][2] for chr in tail_span_chars])
|
|
|
+ max_of_tail_span_y1 = max([chr["bbox"][3] for chr in tail_span_chars])
|
|
|
+
|
|
|
if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的
|
|
|
tail_span_txt = "".join([char["c"] for char in tail_span_chars])
|
|
|
last_span_to_insert = last_overlap_span.copy()
|
|
|
@@ -370,12 +370,20 @@ def replace_line_v2(eqinfo, line):
|
|
|
last_span_to_insert["text"] = "".join(
|
|
|
[char["c"] for char in tail_span_chars]
|
|
|
)
|
|
|
- last_span_to_insert["bbox"] = (
|
|
|
- min([chr["bbox"][0] for chr in tail_span_chars]),
|
|
|
- last_overlap_span["bbox"][1],
|
|
|
- last_overlap_span["bbox"][2],
|
|
|
- last_overlap_span["bbox"][3],
|
|
|
- )
|
|
|
+ if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]:
|
|
|
+ last_span_to_insert["bbox"] = (
|
|
|
+ min_of_tail_span_x0,
|
|
|
+ min_of_tail_span_y0,
|
|
|
+ max_of_tail_span_x1,
|
|
|
+ max_of_tail_span_y1
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ last_span_to_insert["bbox"] = (
|
|
|
+ min([chr["bbox"][0] for chr in tail_span_chars]),
|
|
|
+ last_overlap_span["bbox"][1],
|
|
|
+ last_overlap_span["bbox"][2],
|
|
|
+ last_overlap_span["bbox"][3],
|
|
|
+ )
|
|
|
# 插入到公式对象之后
|
|
|
equation_idx = line["spans"].index(equation_span)
|
|
|
line["spans"].insert(equation_idx + 1, last_span_to_insert) # 放入公式
|
|
|
@@ -460,17 +468,16 @@ def replace_equations_in_textblock(
|
|
|
"""
|
|
|
替换行间和和行内公式为latex
|
|
|
"""
|
|
|
-
|
|
|
raw_text_blocks = remove_text_block_in_interline_equation_bbox(
|
|
|
interline_equation_bboxes, raw_text_blocks
|
|
|
) # 消除重叠:第一步,在公式内部的
|
|
|
+
|
|
|
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(
|
|
|
interline_equation_bboxes, raw_text_blocks
|
|
|
) # 消重,第二步,和公式覆盖的
|
|
|
- insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
|
|
|
|
|
|
+ insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
|
|
|
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
|
|
|
-
|
|
|
return raw_text_blocks
|
|
|
|
|
|
|