Răsfoiți Sursa

Merge pull request #765 from myhloli/add-list-group

refactor(para): improve paragraph splitting algorithm
Xiaomeng Zhao 1 an în urmă
părinte
comite
e4904cd6d3

+ 23 - 26
magic_pdf/dict2md/ocr_mkcontent.py

@@ -36,9 +36,9 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
             paras_of_layout, 'mm', img_buket_path)
         markdown_with_para_and_pagination.append({
             'page_no':
-            page_no,
+                page_no,
             'md_content':
-            '\n\n'.join(page_markdown)
+                '\n\n'.join(page_markdown)
         })
         page_no += 1
     return markdown_with_para_and_pagination
@@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
 def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                       mode,
                                       img_buket_path='',
-                                      parse_type="auto",
-                                      lang=None
                                       ):
     page_markdown = []
     for para_block in paras_of_layout:
         para_text = ''
         para_type = para_block['type']
         if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
-            para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang)
+            para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
-            para_text = f'# {merge_para_with_text(para_block, parse_type=parse_type, lang=lang)}'
+            para_text = f'# {merge_para_with_text(para_block)}'
         elif para_type == BlockType.InterlineEquation:
-            para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang)
+            para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
             if mode == 'nlp':
                 continue
@@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                     para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 2nd.拼image_caption
                     if block['type'] == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                        para_text += merge_para_with_text(block)
                 for block in para_block['blocks']:  # 2nd.拼image_caption
                     if block['type'] == BlockType.ImageFootnote:
-                        para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                        para_text += merge_para_with_text(block)
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                        para_text += merge_para_with_text(block)
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:
@@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                        para_text += merge_para_with_text(block)
 
         if para_text.strip() == '':
             continue
@@ -120,7 +118,7 @@ def detect_language(text):
         return 'empty'
 
 
-def merge_para_with_text(para_block, parse_type="auto", lang=None):
+def merge_para_with_text(para_block):
     para_text = ''
     for i, line in enumerate(para_block['lines']):
 
@@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
     return para_text
 
 
-def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
+def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
     para_type = para_block['type']
     para_content = {}
     if para_type == BlockType.Text:
         para_content = {
             'type': 'text',
-            'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
+            'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.Title:
         para_content = {
             'type': 'text',
-            'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
+            'text': merge_para_with_text(para_block),
             'text_level': 1,
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
             'type': 'equation',
-            'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
+            'text': merge_para_with_text(para_block),
             'text_format': 'latex',
         }
     elif para_type == BlockType.Image:
@@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
                     img_buket_path,
                     block['lines'][0]['spans'][0]['image_path'])
             if block['type'] == BlockType.ImageCaption:
-                para_content['img_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                para_content['img_caption'] = merge_para_with_text(block)
             if block['type'] == BlockType.ImageFootnote:
-                para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                para_content['img_footnote'] = merge_para_with_text(block)
     elif para_type == BlockType.Table:
         para_content = {'type': 'table'}
         for block in para_block['blocks']:
@@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
                     para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
-                para_content['table_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                para_content['table_caption'] = merge_para_with_text(block)
             if block['type'] == BlockType.TableFootnote:
-                para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
+                para_content['table_footnote'] = merge_para_with_text(block)
 
     para_content['page_idx'] = page_idx
 
@@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list,
                make_mode: str,
                drop_mode: str,
                img_buket_path: str = '',
-               parse_type: str = "auto",
-               lang=None):
+               ):
     output_content = []
     for page_info in pdf_info_dict:
         drop_reason_flag = False
@@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list,
             continue
         if make_mode == MakeMode.MM_MD:
             page_markdown = ocr_mk_markdown_with_para_core_v2(
-                paras_of_layout, 'mm', img_buket_path, parse_type=parse_type, lang=lang)
+                paras_of_layout, 'mm', img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.NLP_MD:
             page_markdown = ocr_mk_markdown_with_para_core_v2(
-                paras_of_layout, 'nlp', parse_type=parse_type, lang=lang)
+                paras_of_layout, 'nlp')
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.STANDARD_FORMAT:
             for para_block in paras_of_layout:
                 if drop_reason_flag:
                     para_content = para_to_standard_format_v2(
-                        para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang, drop_reason=drop_reason)
+                        para_block, img_buket_path, page_idx)
                 else:
                     para_content = para_to_standard_format_v2(
-                        para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang)
+                        para_block, img_buket_path, page_idx)
                 output_content.append(para_content)
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
         return '\n\n'.join(output_content)

+ 15 - 2
magic_pdf/para/para_split_v3.py

@@ -59,7 +59,7 @@ def __is_list_or_index_block(block):
     # index block 是一种特殊的list block
     # 一个block如果是index block 应该同时满足以下特征
     # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
-    if len(block['lines']) >= 3:
+    if len(block['lines']) >= 2:
         first_line = block['lines'][0]
         line_height = first_line['bbox'][3] - first_line['bbox'][1]
         block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
@@ -227,6 +227,15 @@ def __merge_2_list_blocks(block1, block2):
     return block1, block2
 
 
+def __is_list_group(text_blocks_group):
+    # list group的特征是一个group内的所有block都满足以下条件
+    # 1.每个block都不超过3行 2. 每个block 的左边界都比较接近(逻辑简单点先不加这个规则)
+    for block in text_blocks_group:
+        if len(block['lines']) > 3:
+            return False
+    return True
+
+
 def __para_merge_page(blocks):
     page_text_blocks_groups = __process_blocks(blocks)
     for text_blocks_group in page_text_blocks_groups:
@@ -239,6 +248,10 @@ def __para_merge_page(blocks):
                 # logger.info(f"{block['type']}:{block}")
 
         if len(text_blocks_group) > 1:
+
+            # 在合并前判断这个group 是否是一个 list group
+            is_list_group = __is_list_group(text_blocks_group)
+
             # 倒序遍历
             for i in range(len(text_blocks_group) - 1, -1, -1):
                 current_block = text_blocks_group[i]
@@ -247,7 +260,7 @@ def __para_merge_page(blocks):
                 if i - 1 >= 0:
                     prev_block = text_blocks_group[i - 1]
 
-                    if current_block['type'] == 'text' and prev_block['type'] == 'text':
+                    if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
                         __merge_2_text_blocks(current_block, prev_block)
                     elif (
                             (current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or

+ 2 - 6
magic_pdf/pipe/AbsPipe.py

@@ -95,9 +95,7 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        parse_type = pdf_mid_data["_parse_type"]
-        lang = pdf_mid_data.get("_lang", None)
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
         return content_list
 
     @staticmethod
@@ -107,9 +105,7 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        parse_type = pdf_mid_data["_parse_type"]
-        lang = pdf_mid_data.get("_lang", None)
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
         return md_content