Bläddra i källkod

Merge pull request #31 from myhloli/master

重构 parse_by_ocr_v2.py
myhloli 1 år sedan
förälder
incheckning
91d296deb6

+ 13 - 0
magic_pdf/libs/ocr_content_type.py

@@ -4,4 +4,17 @@ class ContentType:
     Text = "text"
     InlineEquation = "inline_equation"
     InterlineEquation = "interline_equation"
+    
+class BlockType:  # String tags for layout blocks: stored at index 7 of each bbox row, later read back as the block type
+    Image = "image"  # parent image block; fix_block_spans hands these to fix_image_block
+    ImageBody = "image_body"  # sub-block tag passed to make_body_block by fix_image_block
+    ImageCaption = "image_caption"  # sub-block tag passed to merge_spans_to_block by fix_image_block
+    Table = "table"  # parent table block; fix_block_spans hands these to fix_table_block
+    TableBody = "table_body"  # sub-block tag passed to make_body_block by fix_table_block
+    TableCaption = "table_caption"  # sub-block tag passed to merge_spans_to_block by fix_table_block
+    TableFootnote = "table_footnote"  # sub-block tag passed to merge_spans_to_block by fix_table_block
+    Text = "text"  # same string value as ContentType.Text
+    Title = "title"
+    InterlineEquation = "interline_equation"  # same string value as ContentType.InterlineEquation
+    Footnote = "footnote"  # rows with this tag are skipped by sort_blocks_by_layout
 

+ 41 - 3
magic_pdf/parse_by_ocr_v2.py

@@ -1,11 +1,17 @@
+import time
+
+from loguru import logger
+
 from magic_pdf.layout.layout_sort import get_bboxes_layout
+from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-from magic_pdf.libs.commons import fitz
+from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.model.magic_model import MagicModel
+from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
 from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
-from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans
+from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
 
 
 def parse_pdf_by_ocr(pdf_bytes,
@@ -15,17 +21,31 @@ def parse_pdf_by_ocr(pdf_bytes,
                      end_page_id=None,
                      debug_mode=False,
                      ):
-
     pdf_bytes_md5 = compute_md5(pdf_bytes)
     pdf_docs = fitz.open("pdf", pdf_bytes)
 
+    '''初始化空的pdf_info_dict'''
+    pdf_info_dict = {}
+
     '''用model_list和docs对象初始化magic_model'''
     magic_model = MagicModel(model_list, pdf_docs)
 
     '''根据输入的起始范围解析pdf'''
     end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+
+    '''初始化启动时间'''
+    start_time = time.time()
+
     for page_id in range(start_page_id, end_page_id + 1):
 
+        '''debug时输出每页解析的耗时'''
+        if debug_mode:
+            time_now = time.time()
+            logger.info(
+                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
+            )
+            start_time = time_now
+
         '''从magic_model对象中获取后面会用到的区块信息'''
         img_blocks = magic_model.get_imgs(page_id)
         table_blocks = magic_model.get_tables(page_id)
@@ -61,3 +81,21 @@ def parse_pdf_by_ocr(pdf_bytes,
         '''对block进行fix操作'''
         fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
 
+        '''获取QA需要外置的list'''
+        images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
+
+        '''构造pdf_info_dict'''
+        page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                                    images, tables, interline_equations, discarded_blocks)
+        pdf_info_dict[f"page_{page_id}"] = page_info
+
+    """分段"""
+    pass
+
+    """dict转list"""
+    pdf_info_list = dict_to_list(pdf_info_dict)
+    new_pdf_info_dict = {
+        "pdf_info": pdf_info_list,
+    }
+
+    return new_pdf_info_dict

+ 28 - 11
magic_pdf/pre_proc/construct_page_dict.py

@@ -1,12 +1,13 @@
-
-def construct_page_component(page_id, image_info, table_info,  text_blocks_preproc, layout_bboxes, inline_eq_info, interline_eq_info, raw_pymu_blocks, 
-                             removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,layout_tree,
+def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
+                             interline_eq_info, raw_pymu_blocks,
+                             removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
+                             layout_tree,
                              page_w, page_h, footnote_bboxes_tmp):
     """
     
     """
     return_dict = {}
-    
+
     return_dict['para_blocks'] = {}
     return_dict['preproc_blocks'] = text_blocks_preproc
     return_dict['images'] = image_info
@@ -16,24 +17,24 @@ def construct_page_component(page_id, image_info, table_info,  text_blocks_prepr
     return_dict['layout_bboxes'] = layout_bboxes
     return_dict['pymu_raw_blocks'] = raw_pymu_blocks
     return_dict['global_statistic'] = {}
-    
+
     return_dict['droped_text_block'] = removed_text_blocks
     return_dict['droped_image_block'] = removed_image_blocks
     return_dict['droped_table_block'] = []
     return_dict['image_backup'] = images_backup
-    return_dict['table_backup'] = []    
+    return_dict['table_backup'] = []
     return_dict['page_idx'] = page_id
     return_dict['page_size'] = [page_w, page_h]
-    return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
+    return_dict['_layout_tree'] = layout_tree  # 辅助分析layout作用
     return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
-    
+
     return return_dict
 
 
 def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                             images, tables, interline_equations, inline_equations,
-                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
-                             need_remove_spans_bboxes_dict):
+                                 images, tables, interline_equations, inline_equations,
+                                 dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
+                                 need_remove_spans_bboxes_dict):
     return_dict = {
         'preproc_blocks': blocks,
         'layout_bboxes': layout_bboxes,
@@ -51,3 +52,19 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
         'droped_bboxes': need_remove_spans_bboxes_dict,
     }
     return return_dict
+
+
+def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                    images, tables, interline_equations, droped_blocks):
+    return_dict = {  # per-page result dict; every argument is stored as passed in (no copies are made)
+        'preproc_blocks': blocks,
+        'layout_bboxes': layout_bboxes,
+        'page_idx': page_id,
+        'page_size': [page_w, page_h],  # [width, height]
+        '_layout_tree': layout_tree,
+        'images': images,
+        'tables': tables,
+        'interline_equations': interline_equations,
+        'droped_blocks': droped_blocks,  # "droped" spelling matches the 'droped_*' keys of the older constructors
+    }
+    return return_dict

+ 9 - 6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -1,32 +1,35 @@
+from magic_pdf.libs.ocr_content_type import BlockType
+
+
 def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                         title_blocks, interline_equation_blocks, page_w, page_h):
     all_bboxes = []
 
     for image in img_blocks:
         x0, y0, x1, y1 = image['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, 'image_block', None, None, None, None])
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
 
     for table in table_blocks:
         x0, y0, x1, y1 = table['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, 'table_block', None, None, None, None])
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None])
 
     for text in text_blocks:
         x0, y0, x1, y1 = text['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, 'text_block', None, None, None, None])
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None])
 
     for title in title_blocks:
         x0, y0, x1, y1 = title['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, 'title_block', None, None, None, None])
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None])
 
     for interline_equation in interline_equation_blocks:
         x0, y0, x1, y1 = interline_equation['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, 'interline_equation_block', None, None, None, None])
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None])
 
     '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
     for discarded in discarded_blocks:
         x0, y0, x1, y1 = discarded['bbox']
         if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
-            all_bboxes.append([x0, y0, x1, y1, None, None, None, 'footnote', None, None, None, None])
+            all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
 
     return all_bboxes
 

+ 17 - 12
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -3,8 +3,8 @@ from loguru import logger
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
     calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType
-from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
 
 
 # 将每一个line中的span从左到右排序
@@ -117,7 +117,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
         layout_blocks = []
         for block in all_bboxes:
             # 如果是footnote则跳过
-            if block[7] == 'footnote':
+            if block[7] == BlockType.Footnote:
                 continue
             block_bbox = [block[0], block[1], block[2], block[3]]
             if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
@@ -141,6 +141,9 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
 
 
 def fill_spans_in_blocks(blocks, spans):
+    '''
+    将allspans中的span按位置关系,放入blocks中
+    '''
     block_with_spans = []
     for block in blocks:
         block_type = block[7]
@@ -166,20 +169,22 @@ def fill_spans_in_blocks(blocks, spans):
 
 
 def fix_block_spans(block_with_spans, img_blocks, table_blocks):
+    '''
+    1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
+        需要将caption和footnote的text_span放入相应img_block和table_block内的
+        caption_block和footnote_block中
+    2、同时需要删除block中的spans字段
+    '''
     fix_blocks = []
     for block in block_with_spans:
         block_type = block['block_type']
-        # 只有type为image_block和table_block才需要处理
-        if block_type == 'image_block':
+
+        if block_type == BlockType.Image:
             block = fix_image_block(block, img_blocks)
-        elif block_type == 'table_block':
+        elif block_type == BlockType.Table:
             block = fix_table_block(block, table_blocks)
-        elif block_type == 'text_block':
-            pass
-        elif block_type == 'title_block':
-            pass
-        elif block_type == 'interline_equation_block':
-            pass
+        elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
+            block = fix_text_block(block)
         else:
             continue
         fix_blocks.append(block)

+ 18 - 6
magic_pdf/pre_proc/ocr_fix_block_logic.py

@@ -1,7 +1,8 @@
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
-from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, line_sort_spans_by_left_to_right
 
+
 def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
     block_spans = []
     # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
@@ -18,6 +19,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
     }
     return block, block_spans
 
+
 def make_body_block(span: dict, block_bbox: list, block_type: str):
     # 创建body_block
     body_line = {
@@ -41,7 +43,7 @@ def fix_image_block(block, img_blocks):
             for span in block['spans']:
                 if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
                     # 创建img_body_block
-                    img_body_block = make_body_block(span, img_block['img_body_bbox'], 'img_body_block')
+                    img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
                     block['blocks'].append(img_body_block)
 
                     # 从spans中移除img_body_block中已经放入的span
@@ -51,7 +53,7 @@ def fix_image_block(block, img_blocks):
             # 根据list长度,判断img_block中是否有img_caption
             if len(img_block['img_caption_bbox']) > 0:
                 img_caption_block, img_caption_spans = merge_spans_to_block(
-                    block['spans'], img_block['img_caption_bbox'], 'img_caption_block'
+                    block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
                 )
                 block['blocks'].append(img_caption_block)
 
@@ -69,7 +71,7 @@ def fix_table_block(block, table_blocks):
             for span in block['spans']:
                 if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
                     # 创建table_body_block
-                    table_body_block = make_body_block(span, table_block['table_body_bbox'], 'table_body_block')
+                    table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
                     block['blocks'].append(table_body_block)
 
                     # 从spans中移除img_body_block中已经放入的span
@@ -79,7 +81,7 @@ def fix_table_block(block, table_blocks):
             # 根据list长度,判断table_block中是否有caption
             if len(table_block['table_caption_bbox']) > 0:
                 table_caption_block, table_caption_spans = merge_spans_to_block(
-                    block['spans'], table_block['table_caption_bbox'], 'table_caption_block'
+                    block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
                 )
                 block['blocks'].append(table_caption_block)
 
@@ -92,10 +94,20 @@ def fix_table_block(block, table_blocks):
             # 根据list长度,判断table_block中是否有table_note
             if len(table_block['table_footnote_bbox']) > 0:
                 table_footnote_block, table_footnote_spans = merge_spans_to_block(
-                    block['spans'], table_block['table_footnote_bbox'], 'table_footnote_block'
+                    block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
                 )
                 block['blocks'].append(table_footnote_block)
 
             break
     del block['spans']
     return block
+
+
+def fix_text_block(block):  # Replace block['spans'] with block['lines']; mutates the dict in place and returns it
+    block_lines = merge_spans_to_line(block['spans'])  # presumably groups spans into lines - helper not shown, confirm
+    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)  # presumably sorts each line's spans by x - confirm
+    block['lines'] = sort_block_lines
+    del block['spans']  # the raw span list is dropped once 'lines' has been set
+    return block
+
+

+ 25 - 7
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -3,7 +3,7 @@ from loguru import logger
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
     __is_overlaps_y_exceeds_threshold
 from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 
 
 def remove_overlaps_min_spans(spans):
@@ -50,7 +50,8 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
                     need_remove_spans.append(span)
                     break
                 # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
-                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1]+span['bbox'][3])/2 > removed_bbox[3] and removed_bbox[0] < (span['bbox'][0]+span['bbox'][2])/2 < removed_bbox[2]:
+                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
+                        removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
                     need_remove_spans.append(span)
                     break
 
@@ -162,9 +163,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
             text_line = text_inline_lines[j]
             y0, y1 = text_line[1]
             if (
-                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
-                span['bbox'], (0, y0, 0, y1)):
-
+                    span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
+            ) and __is_overlaps_y_exceeds_threshold(
+                span['bbox'], (0, y0, 0, y1)
+            ):
                 # 调整公式类型
                 if span["type"] == ContentType.InterlineEquation:
                     # 最后一行是行间公式
@@ -181,8 +183,8 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                             span["bbox"][1] = y0
                             span["bbox"][3] = y1
                 break
-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
-                                                                                                       (0, y0, 0, y1)):
+            elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                                                                (0, y0, 0, y1)):
                 break
             else:
                 j += 1
@@ -211,3 +213,19 @@ def get_qa_need_list(blocks):
                 else:
                     continue
     return images, tables, interline_equations, inline_equations
+
+
+def get_qa_need_list_v2(blocks):
+    """Return (images, tables, interline_equations): the input blocks grouped by type; other types are skipped."""
+    images, tables, interline_equations = [], [], []
+    for block in blocks:
+        # fix_block_spans reads this tag from 'block_type', so fall back to that key when 'type' is absent.
+        block_type = block["type"] if "type" in block else block["block_type"]
+        if block_type == BlockType.Image:
+            images.append(block)
+        elif block_type == BlockType.Table:
+            tables.append(block)
+        elif block_type == BlockType.InterlineEquation:
+            interline_equations.append(block)
+
+    return images, tables, interline_equations