kernel.h@qq.com пре 1 године
родитељ
комит
d062bb6ce9
2 измењена фајла са 15 додатих и 12 уклоњених редова
  1. 9 7
      demo/ocr_demo.py
  2. 6 5
      magic_pdf/para/para_split.py

+ 9 - 7
demo/ocr_demo.py

@@ -83,11 +83,13 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
 
 
 if __name__ == '__main__':
-    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
-    # pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
-    # json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
-    # pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
-    # json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
-    # ocr_local_parse(pdf_path, json_file_path)
+    #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
+    
+    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
+    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
+    # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
+    # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
     ocr_online_parse(book_name="数学新星网/edu_00001236")

+ 6 - 5
magic_pdf/para/para_split.py

@@ -2,7 +2,7 @@ from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
 
-from magic_pdf.libs.boxbase import _is_in
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap
 from magic_pdf.libs.ocr_content_type import ContentType
 
 
@@ -50,7 +50,7 @@ def __valign_lines(blocks, layout_bboxes):
     new_layout_bboxes = []
     
     for layout_box in layout_bboxes:
-        blocks_in_layoutbox = [b for b in blocks if _is_in(b['bbox'], layout_box['layout_bbox'])]
+        blocks_in_layoutbox = [b for b in blocks if _is_in_or_part_overlap(b['bbox'], layout_box['layout_bbox'])]
         if len(blocks_in_layoutbox)==0:
             continue
         
@@ -136,7 +136,7 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
     lines_group = []
     
     for lyout in layout_bboxes:
-        lines = [line for block in blocks if _is_in(block['bbox'], lyout['layout_bbox']) for line in block['lines']]
+        lines = [line for block in blocks if _is_in_or_part_overlap(block['bbox'], lyout['layout_bbox']) for line in block['lines']]
         lines_group.append(lines)
 
     return lines_group
@@ -159,6 +159,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
             continue
         #layout_right = max([line['bbox'][2] for line in lines])
         layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
+        layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
         para = [] # 元素是line
         
         for i, line in enumerate(lines):
@@ -173,7 +174,7 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
                     para.append(line)
                     paras.append(para)
                     para = []
-                elif line['bbox'][2] >= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_right: # 现在这行到了行尾沾满,下一行存在且顶格。
+                elif line['bbox'][2] >= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_left: # 现在这行到了行尾沾满,下一行存在且顶格。
                     para.append(line)
                 else: 
                     para.append(line)
@@ -197,7 +198,7 @@ def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
     根据line找到所在的layout
     """
     for layout in layout_bboxes:
-        if _is_in(line_bbox, layout):
+        if _is_in_or_part_overlap(line_bbox, layout):
             return layout
     return None