Browse Source

在dict中加入qa需要的字段

赵小蒙 1 year ago
parent
commit
85587b257b

+ 40 - 20
magic_pdf/pdf_parse_by_ocr.py

@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
 from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
-    merge_spans_to_line_by_layout,
+    merge_spans_to_line_by_layout, merge_lines_to_block,
 )
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
-    adjust_bbox_for_standalone_block,modify_y_axis,modify_inline_equation
+    adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
+    remove_spans_by_bboxes_dict
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 
-def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree):
+def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                             images, tables, interline_equations, inline_equations,
+                             dropped_text_block, dropped_image_block, dropped_table_block):  # packs its arguments, unchanged, into one per-page dict
     return_dict = {
         'preproc_blocks': blocks,
         'layout_bboxes': layout_bboxes,
         'page_idx': page_id,
         'page_size': [page_w, page_h],  # width first, then height
         '_layout_tree': layout_tree,
+        'images': images,
+        'tables': tables,
+        'interline_equations': interline_equations,
+        'inline_equations': inline_equations,
+        'dropped_text_block': dropped_text_block,
+        'dropped_image_block': dropped_image_block,
+        'dropped_table_block': dropped_table_block,
     }
     return return_dict
 
@@ -79,7 +89,6 @@ def parse_pdf_by_ocr(
 
     start_time = time.time()
 
-    remove_bboxes = []
 
     end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
     for page_id in range(start_page_id, end_page_id + 1):
@@ -111,11 +120,19 @@ def parse_pdf_by_ocr(
         )
 
         # 构建需要remove的bbox列表
-        need_remove_spans_bboxes = []
-        need_remove_spans_bboxes.extend(page_no_bboxes)
-        need_remove_spans_bboxes.extend(header_bboxes)
-        need_remove_spans_bboxes.extend(footer_bboxes)
-        need_remove_spans_bboxes.extend(footnote_bboxes)
+        # need_remove_spans_bboxes = []
+        # need_remove_spans_bboxes.extend(page_no_bboxes)
+        # need_remove_spans_bboxes.extend(header_bboxes)
+        # need_remove_spans_bboxes.extend(footer_bboxes)
+        # need_remove_spans_bboxes.extend(footnote_bboxes)
+
+        # 构建需要remove的bbox字典
+        need_remove_spans_bboxes_dict = {
+            "page_no": page_no_bboxes,
+            "header": header_bboxes,
+            "footer": footer_bboxes,
+            "footnote": footnote_bboxes,
+        }
 
         layout_dets = ocr_page_info["layout_dets"]
         spans = []
@@ -177,7 +194,9 @@ def parse_pdf_by_ocr(
         spans = remove_overlaps_min_spans(spans)
 
         # 删除remove_span_block_bboxes中的bbox
-        spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
+        # spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
+        # 按qa要求,增加drop相关数据
+        spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
 
         # 对image和table截图
         spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
@@ -202,18 +221,19 @@ def parse_pdf_by_ocr(
         # 将spans合并成line(在layout内,从上到下,从左到右)
         lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
 
-        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
-        blocks = []
-        for line in lines:
-            blocks.append(
-                {
-                    "bbox": line["bbox"],
-                    "lines": [line],
-                }
-            )
+        # 将lines合并成block
+        blocks = merge_lines_to_block(lines)
+
+        # 根据block合并段落
+
+
+        # 获取QA需要外置的list
+        images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
 
         # 构造pdf_info_dict
-        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree)
+        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                             images, tables, interline_equations, inline_equations,
+                                             dropped_text_block, dropped_image_block, dropped_table_block)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     # 在测试时,保存调试信息

+ 13 - 0
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
     return lines
 
 
+def merge_lines_to_block(lines):
+    # No block merging is done yet; this only builds the structure: each block holds exactly one line, and the block's bbox is that line's bbox
+    blocks = []
+    for line in lines:
+        blocks.append(
+            {
+                "bbox": line["bbox"],  # the same bbox object as the line's, not a copy
+                "lines": [line],
+            }
+        )
+    return blocks
+
+
 
 
 

+ 66 - 12
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -1,3 +1,5 @@
+from loguru import logger
+
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
     __is_overlaps_y_exceeds_threshold
 
@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
     return spans
 
 
+def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):  # removes spans covered by the given bboxes; returns the kept spans plus the dropped ones grouped by type
+    dropped_text_block = []
+    dropped_image_block = []
+    dropped_table_block = []
+    for key, value in need_remove_spans_bboxes_dict.items():  # key: tag name, value: the bboxes for that tag
+        # logger.info(f"remove spans by bbox dict, key: {key}, value: {value}")
+        need_remove_spans = []
+        for span in spans:
+            for removed_bbox in value:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:  # presumably overlap area / span area - TODO confirm in boxbase
+                    need_remove_spans.append(span)
+                    break  # one matching bbox is enough; also keeps the span from being queued twice
+
+        for span in need_remove_spans:
+            spans.remove(span)  # mutates the caller's list in place
+            span['tag'] = key  # record which bbox category caused the drop
+            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
+                dropped_text_block.append(span)
+            elif span['type'] == 'image':
+                dropped_image_block.append(span)
+            elif span['type'] == 'table':  # NOTE: a removed span of any other type is not recorded in any dropped list
+                dropped_table_block.append(span)
+
+    return spans, dropped_text_block, dropped_image_block, dropped_table_block
+
+
 def adjust_bbox_for_standalone_block(spans):
     # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
     for sb_span in spans:
@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans):
     return spans
 
 
-
 def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
     # displayed_list = []
 
@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
         current_line = line[0]
         current_line.sort(key=lambda span: span['bbox'][0])
 
-
-    #调整每一个文字行内bbox统一
+    # 调整每一个文字行内bbox统一
     for line in text_inline_lines:
         current_line, (line_first_y0, line_first_y) = line
         for span in current_line:
@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
     # return spans, displayed_list, text_inline_lines
 
+
 def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
-    #错误行间公式转行内公式
+    # 错误行间公式转行内公式
     j = 0
     for i in range(len(displayed_list)):
         # if i == 8:
@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
         while j < len(text_inline_lines):
             text_line = text_inline_lines[j]
             y0, y1 = text_line[1]
-            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+            if (
+                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
+                    span['bbox'], (0, y0, 0, y1)):
 
-                #调整公式类型
+                # 调整公式类型
                 if span["type"] == "displayed_equation":
-                    #最后一行是行间公式
-                    if j+1 >= len(text_inline_lines):
+                    # 最后一行是行间公式
+                    if j + 1 >= len(text_inline_lines):
                         span["type"] = "inline_equation"
                         span["bbox"][1] = y0
                         span["bbox"][3] = y1
                     else:
-                        #行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
+                        # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
                         y0_next, y1_next = text_inline_lines[j + 1][1]
-                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3*(y1-y0) > span_y - span_y0:
+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
+                                y1 - y0) > span_y - span_y0:
                             span["type"] = "inline_equation"
                             span["bbox"][1] = y0
                             span["bbox"][3] = y1
                 break
-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                                                                       (0, y0, 0, y1)):
                 break
             else:
                 j += 1
 
-    return spans
+    return spans
+
+
+def get_qa_need_list(blocks):  # gathers spans of selected types from every line of every block
+    # Collect the image, table, interline-equation and inline-equation spans into separate lists (the lists reference the original span objects; no copies are made)
+    images = []
+    tables = []
+    interline_equations = []
+    inline_equations = []
+
+    for block in blocks:
+        for line in block["lines"]:
+            for span in line["spans"]:
+                if span["type"] == "image":
+                    images.append(span)
+                elif span["type"] == "table":
+                    tables.append(span)
+                elif span["type"] == "inline_equation":
+                    inline_equations.append(span)
+                elif span["type"] == "displayed_equation":  # "displayed_equation" spans are what is returned as interline_equations
+                    interline_equations.append(span)
+                else:
+                    continue  # every other span type is ignored
+    return images, tables, interline_equations, inline_equations