소스 검색

OCR line的左右侧如果超过layoutbox,那么让layoutbox截断左右侧

kernel.h@qq.com 1 년 전
부모
커밋
c3b8f6d7bb
2개의 파일이 변경되었으며, 25개의 추가와 6개의 삭제가 있습니다
  1. 3 3
      demo/ocr_demo.py
  2. 22 3
      magic_pdf/para/para_split.py

+ 3 - 3
demo/ocr_demo.py

@@ -115,8 +115,8 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
 if __name__ == '__main__':
     pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
     json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
-    # ocr_local_parse(pdf_path, json_file_path)
-    book_name = "科数网/edu_00011318"
-    ocr_online_parse(book_name)
+    ocr_local_parse(pdf_path, json_file_path)
+    # book_name = "科数网/edu_00011318"
+    # ocr_online_parse(book_name)
     
     pass

+ 22 - 3
magic_pdf/para/para_split.py

@@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
     return new_layout_bboxes
 
 
+def __align_text_in_layout(blocks, layout_bboxes):
+    """
+    OCR lines sometimes carry blank space at their start or end, so align the text: clamp in place the left/right edge of every line bbox that sticks out of its layout box. Returns None.
+    """
+    for layout in layout_bboxes:
+        lb = layout['layout_bbox']  # indices 0 and 2 are used below as the left/right x bounds
+        blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], lb)]  # blocks that is_in_layout reports as inside this layout
+        if len(blocks_in_layoutbox)==0:
+            continue
+        
+        for block in blocks_in_layoutbox:
+            for line in block['lines']:
+                x0, x1 = line['bbox'][0], line['bbox'][2]
+                if x0 < lb[0]:
+                    line['bbox'][0] = lb[0]  # starts left of the layout: clamp to its left edge
+                if x1 > lb[2]:
+                    line['bbox'][2] = lb[2]  # ends right of the layout: clamp to its right edge
+    
+ 
 def __common_pre_proc(blocks, layout_bboxes):
     """
     Language-independent preprocessing of the text; returns the layout bboxes produced by __valign_lines.
     """
     #__add_line_period(blocks, layout_bboxes)
+    __align_text_in_layout(blocks, layout_bboxes)  # clamp line bboxes to their layout before __valign_lines runs
     aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
     
     return aligned_layout_bboxes
@@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
     layout_paras = []
     right_tail_distance = 1.5 * char_avg_len
     
-    
     for lines in lines_group:
         paras = []
         total_lines = len(lines)
@@ -575,8 +594,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
     
     
     return connected_layout_paras, page_list_info
-   
-
+       
+    
 def para_split(pdf_info_dict, debug_mode, lang="en"):
     """
     根据line和layout情况进行分段