Sfoglia il codice sorgente

更新了para_split

liukaiwen 1 anno fa
parent
commit
e31066bae5

+ 5 - 6
magic_pdf/para/para_split_by_model.py → magic_pdf/para/para_split_v2.py

@@ -256,7 +256,7 @@ def __split_para_in_layoutbox2(lines_group, new_layout_bbox, lang="en", char_avg
 
 
 
-def __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang="en", char_avg_len=10):
+def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en", char_avg_len=10):
     """
     lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
     1. 先计算每个group的左右边界。
@@ -624,7 +624,7 @@ def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
     pass
 
 
-def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang):
+def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
     """
     根据line和layout情况进行分段
     先实现一个根据行末尾特征分段的简单方法。
@@ -637,7 +637,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
     4. 图、表,目前独占一行,不考虑分段。
     """
     lines_group, blocks_group = __group_line_by_layout(blocks, layout_bboxes, lang)  # block内分段
-    layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, text_blocks, lang)  # layout内分段
+    layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang)  # layout内分段
     blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
                                                                 page_num, lang)  # layout之间连接列表段落
     connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox, lang)  # layout间链接段落
@@ -646,16 +646,15 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_nu
 
 
 
-def para_split_by_model(pdf_info_dict, debug_mode, magic_model: MagicModel, lang="en"):
+def para_split(pdf_info_dict, debug_mode, lang="en"):
     new_layout_of_pages = []  # 数组的数组,每个元素是一个页面的layoutS
     all_page_list_info = []  # 保存每个页面开头和结尾是否是列表
     for page_num, page in pdf_info_dict.items():
         blocks = page['preproc_blocks']
         layout_bboxes = page['layout_bboxes']
-        text_blocks = magic_model.get_text_blocks(page_num)
         new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
         new_layout_of_pages.append(new_layout_bbox)
-        splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, text_blocks, page_num, lang)
+        splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
         all_page_list_info.append(page_list_info)
         page['para_blocks'] = splited_blocks
 

+ 2 - 5
magic_pdf/pdf_parse_by_ocr.py

@@ -11,7 +11,6 @@ from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.para.para_split import para_split
-from magic_pdf.para.para_split_by_model import para_split_by_model
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
@@ -209,10 +208,8 @@ def parse_pdf_by_ocr(
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """分段"""
-    if debug_mode:
-        para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
-    else:
-        para_split(pdf_info_dict, debug_mode=debug_mode)
+
+    para_split(pdf_info_dict, debug_mode=debug_mode)
 
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)

+ 4 - 6
magic_pdf/pdf_parse_by_ocr_v2.py

@@ -12,8 +12,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
 from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
-from magic_pdf.para.para_split import para_split
-from magic_pdf.para.para_split_by_model import para_split_by_model
+# The legacy splitter (magic_pdf.para.para_split.para_split) is superseded by the v2 import below.
+from magic_pdf.para.para_split_v2 import para_split
 
 
 def parse_pdf_by_ocr(pdf_bytes,
@@ -92,10 +92,8 @@ def parse_pdf_by_ocr(pdf_bytes,
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """分段"""
-    if debug_mode:
-        para_split_by_model(pdf_info_dict, debug_mode=debug_mode)
-    else:
-        para_split(pdf_info_dict, debug_mode=debug_mode)
+    # Paragraph splitting now always runs; it is no longer gated on debug_mode.
+    para_split(pdf_info_dict, debug_mode=debug_mode)
 
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)