Quellcode durchsuchen

新增 solve_line_alien.py;在 detect_equation.py 中修改了 __solve_contain_bboxs 函数;并在 pdf_parse_by_model.py 中增加了对 solve_line_alien.py 里 solve_inline_too_large_interval 函数的调用

hsy vor 1 Jahr
Ursprung
Commit
bc339320ab

+ 5 - 0
magic_pdf/pdf_parse_by_model.py

@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
 from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
 from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
+from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
 
 denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
 titleDetectionException_msg = TitleDetectionException().message
@@ -446,6 +447,10 @@ def parse_pdf_by_model(
     ==================================================================================================================================
     进入段落处理-2阶段
     """
+
+    # 处理行内文字间距较大问题
+    pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
+    
     start_time = time.time()
 
     para_process_pipeline = ParaProcessPipeline()

+ 11 - 2
magic_pdf/pre_proc/detect_equation.py

@@ -1,4 +1,4 @@
-from magic_pdf.libs.boxbase import _is_in               # 正则
+from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio              # 正则
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
 
 
@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
                 dump_list.append(all_bbox_list[i])
             elif _is_in(bbox2, bbox1):
                 dump_list.append(all_bbox_list[j])
-    
+            else:
+                ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
+                if ratio > 0.7:
+                    s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) 
+                    s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+                    if s2 > s1:  
+                        dump_list.append(all_bbox_list[i])
+                    else:
+                        dump_list.append(all_bbox_list[i]) 
+
     # 遍历需要删除的列表中的每个元素
     for item in dump_list:
         

+ 29 - 0
magic_pdf/pre_proc/solve_line_alien.py

@@ -0,0 +1,29 @@
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # operates on each page's 'preproc_blocks'
    """Insert a separating space between consecutive lines that share a row.

    Within every block, a line is treated as lying on the same row as the
    line immediately before it when both the top and the bottom y-coordinates
    of their bboxes are equal after truncation to ``int``. For such a line a
    single space is prepended to the text of its first span, so that the two
    pieces do not run together when they are later concatenated.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages are
            expected to carry a ``'preproc_blocks'`` list; each block a
            ``'lines'`` list; each line a ``'bbox'`` of ``(x1, y1, x2, y2)``
            and a ``'spans'`` list whose items hold a ``'text'`` string.
            Entries that are not dicts, or that lack ``'preproc_blocks'``,
            are left untouched.

    Returns:
        The same ``pdf_info_dict`` object, modified in place.
    """
    # Walk the entries that are actually present instead of rebuilding
    # 'page_{i}' keys from len(): that raised KeyError as soon as the dict
    # held a non-page entry or page ids that do not start at 0.
    for page_info in pdf_info_dict.values():
        if not isinstance(page_info, dict):
            continue

        for block in page_info.get('preproc_blocks') or ():
            # None (rather than a 0,0,0,0 bbox) marks "no previous line", so
            # a first line sitting at y ~= 0 is never mistaken for a
            # continuation of a line that does not exist.
            previous_row = None

            for line in block.get('lines') or ():
                _, y_top, _, y_bottom = line['bbox']
                current_row = (int(y_top), int(y_bottom))

                spans = line.get('spans')
                # Lines without spans (or whose first span has no text) have
                # nothing to prefix; skip them instead of raising.
                if current_row == previous_row and spans and 'text' in spans[0]:
                    spans[0]['text'] = ' ' + spans[0]['text']

                previous_row = current_row

    return pdf_info_dict
+
+
+
+
+
+
+
+