瀏覽代碼

refactor(para): improve line stop flag and remove unused debug mode

- Add '-', '—' and '–' (hyphen, em dash, en dash) to LINE_STOP_FLAG in pdf_parse_union_core_v2.py
- Remove unused debug_mode parameter from para_split function in para_split_v3.py
myhloli 1 年之前
父節點
當前提交
5d6cbcb123
共有 2 個文件被更改,包括 3 次插入和 3 次删除
  1. 1 1
      magic_pdf/para/para_split_v3.py
  2. 2 2
      magic_pdf/pdf_parse_union_core_v2.py

+ 1 - 1
magic_pdf/para/para_split_v3.py

@@ -352,7 +352,7 @@ def __para_merge_page(blocks):
             continue
 
 
-def para_split(pdf_info_dict, debug_mode=False):
+def para_split(pdf_info_dict):
     all_blocks = []
     for page_num, page in pdf_info_dict.items():
         blocks = copy.deepcopy(page['preproc_blocks'])

+ 2 - 2
magic_pdf/pdf_parse_union_core_v2.py

@@ -114,7 +114,7 @@ def chars_to_content(span):
         del span['chars']
 
 
-LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',')
+LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
 def fill_char_in_spans(spans, all_chars):
 
     for char in all_chars:
@@ -830,7 +830,7 @@ def pdf_parse_union(
         pdf_info_dict[f'page_{page_id}'] = page_info
 
     """分段"""
-    para_split(pdf_info_dict, debug_mode=debug_mode)
+    para_split(pdf_info_dict)
 
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)